
Commit 53faf3c

Merge branch 'pytorch:main' into boyuc/observatory_draft_demo
2 parents 2f5a882 + 0a113f8

1,744 files changed; Lines changed: 170,363 additions & 25,348 deletions

File tree: large commits hide most file contents by default; only the files shown below are included.

.ci/docker/build.sh
Lines changed: 18 additions & 5 deletions

@@ -1,6 +1,7 @@
 #!/bin/bash
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -91,14 +92,21 @@ esac
 TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt)
 BUILD_DOCS=1

+# Pull channel + spec/url helpers out of torch_pin.py so install_pytorch.sh
+# (which runs inside the docker build, where torch_pin.py isn't available)
+# can decide between wheel install (test/release) and source build (nightly).
+# Self-hosted runners often have python3 but not the unversioned python alias.
+PYTHON_BIN=$(command -v python3 || command -v python)
+TORCH_PIN_HELPERS=$(cd ../.. && "$PYTHON_BIN" -c "from torch_pin import CHANNEL, torch_spec, torchaudio_spec, torchvision_spec, torch_index_url_base; print(CHANNEL); print(torch_spec()); print(torchaudio_spec()); print(torchvision_spec()); print(torch_index_url_base())")
+TORCH_CHANNEL=$(echo "${TORCH_PIN_HELPERS}" | sed -n '1p')
+TORCH_SPEC=$(echo "${TORCH_PIN_HELPERS}" | sed -n '2p')
+TORCHAUDIO_SPEC=$(echo "${TORCH_PIN_HELPERS}" | sed -n '3p')
+TORCHVISION_SPEC=$(echo "${TORCH_PIN_HELPERS}" | sed -n '4p')
+TORCH_INDEX_URL=$(echo "${TORCH_PIN_HELPERS}" | sed -n '5p')
+
 # Copy requirements-lintrunner.txt from root to here
 cp ../../requirements-lintrunner.txt ./

-# Copy arm setup script from root to here
-# TODO(huydhn): Figure out a way to rebuild the Docker image automatically
-# with a new image hash when the content here is updated
-cp -r ../../examples/arm/ ./arm
-
 docker build \
   --no-cache \
   --progress=plain \
@@ -108,6 +116,11 @@ docker build \
   --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
   --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \
   --build-arg "TORCH_VERSION=${TORCH_VERSION}" \
+  --build-arg "TORCH_CHANNEL=${TORCH_CHANNEL}" \
+  --build-arg "TORCH_SPEC=${TORCH_SPEC}" \
+  --build-arg "TORCHAUDIO_SPEC=${TORCHAUDIO_SPEC}" \
+  --build-arg "TORCHVISION_SPEC=${TORCHVISION_SPEC}" \
+  --build-arg "TORCH_INDEX_URL=${TORCH_INDEX_URL}" \
  --build-arg "BUCK2_VERSION=${BUCK2_VERSION}" \
  --build-arg "LINTRUNNER=${LINTRUNNER:-}" \
  --build-arg "BUILD_DOCS=${BUILD_DOCS}" \
(file name hidden in the large-commit view)
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-a9592258daacad7423fd5f39aaa59c6e36471520
+585799cf7039d376d2ac4848b5ef0b501f60679e

(file name hidden in the large-commit view)
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-659af3c353e49b35c191cdd2dba3b3c79d0e6822
+release/2.11

.ci/docker/common/install_cuda.sh
Lines changed: 3 additions & 1 deletion

@@ -38,13 +38,15 @@ apt-get update
 # - libcublas-dev: cuBLAS development files
 # - libcusparse-dev: cuSPARSE development files
 # - libcufft-dev: cuFFT development files
+# - libcurand-dev: cuRAND development files
 apt-get install -y --no-install-recommends \
   "cuda-nvcc-${CUDA_VERSION_DASH}" \
   "cuda-cudart-dev-${CUDA_VERSION_DASH}" \
   "cuda-nvrtc-dev-${CUDA_VERSION_DASH}" \
   "libcublas-dev-${CUDA_VERSION_DASH}" \
   "libcusparse-dev-${CUDA_VERSION_DASH}" \
-  "libcufft-dev-${CUDA_VERSION_DASH}"
+  "libcufft-dev-${CUDA_VERSION_DASH}" \
+  "libcurand-dev-${CUDA_VERSION_DASH}"

 # Clean up
 apt-get clean

.ci/docker/common/install_cuda_windows_cross_compile.sh
Lines changed: 14 additions & 11 deletions

@@ -48,20 +48,23 @@ get_torch_cuda_version() {
 }

 install_windows_cuda() {
-  # Get CUDA version from torch
-  TORCH_CUDA_VERSION=$(get_torch_cuda_version)
+  # Use CUDA_VERSION env var if set (from Docker build arg), otherwise query PyTorch
+  if [ -n "${CUDA_VERSION:-}" ]; then
+    echo "Using CUDA version from environment: ${CUDA_VERSION}"
+    CUDA_MAJOR_MINOR=$(echo "${CUDA_VERSION}" | cut -d. -f1,2)
+  else
+    TORCH_CUDA_VERSION=$(get_torch_cuda_version)
+
+    if [ -z "${TORCH_CUDA_VERSION}" ] || [ "${TORCH_CUDA_VERSION}" = "None" ]; then
+      echo "ERROR: Could not detect CUDA version from PyTorch."
+      echo "Make sure PyTorch with CUDA support is installed or set CUDA_VERSION."
+      exit 1
+    fi

-  if [ -z "${TORCH_CUDA_VERSION}" ] || [ "${TORCH_CUDA_VERSION}" = "None" ]; then
-    echo "ERROR: Could not detect CUDA version from PyTorch."
-    echo "Make sure PyTorch with CUDA support is installed before running this script."
-    exit 1
+    echo "Detected PyTorch CUDA version: ${TORCH_CUDA_VERSION}"
+    CUDA_MAJOR_MINOR=$(echo "${TORCH_CUDA_VERSION}" | cut -d. -f1,2)
   fi

-  echo "Detected PyTorch CUDA version: ${TORCH_CUDA_VERSION}"
-
-  # Extract major.minor version (e.g., "12.8" from "12.8.1" or "12.8")
-  CUDA_MAJOR_MINOR=$(echo "${TORCH_CUDA_VERSION}" | cut -d. -f1,2)
-
   # Look up the full version and driver version
   if [ -z "${CUDA_DRIVER_MAP[${CUDA_MAJOR_MINOR}]}" ]; then
     echo "ERROR: CUDA version ${CUDA_MAJOR_MINOR} is not in the known version map."

.ci/docker/common/install_pytorch.sh
Lines changed: 24 additions & 1 deletion

@@ -17,6 +17,24 @@ install_domains() {
 }

 install_pytorch_and_domains() {
+  if [ "${TORCH_CHANNEL}" != "nightly" ]; then
+    # Test/release: install the published wheels directly. The specs and URL
+    # are passed in as docker build args (computed from torch_pin.py by
+    # .ci/docker/build.sh). RC wheels at /whl/test/ get re-uploaded under the
+    # same version, so use --no-cache-dir there to avoid stale cache hits.
+    local cache_flag=""
+    if [ "${TORCH_CHANNEL}" = "test" ]; then
+      cache_flag="--no-cache-dir"
+    fi
+    pip_install --force-reinstall ${cache_flag} \
+      "${TORCH_SPEC}" "${TORCHVISION_SPEC}" "${TORCHAUDIO_SPEC}" \
+      --index-url "${TORCH_INDEX_URL}/cpu"
+    return
+  fi
+
+  # Nightly: build pytorch from source against the pinned SHA in pytorch.txt
+  # so we catch upstream regressions, then install audio/vision from the
+  # commits that pytorch itself pins.
  git clone https://github.com/pytorch/pytorch.git

  # Fetch the target commit
@@ -27,11 +45,16 @@
  chown -R ci-user .

  export _GLIBCXX_USE_CXX11_ABI=1
+  # PyTorch's FindARM.cmake hard-fails when the SVE+BF16 compile probe
+  # doesn't pass — gcc-11 in this image is too old to accept the combined
+  # NEON/SVE/bfloat16 intrinsics the probe exercises. Executorch's aarch64
+  # runtime targets (phones, embedded) don't use SVE, so bypass the check.
+  export BUILD_IGNORE_SVE_UNAVAILABLE=1
  # Then build and install PyTorch
  conda_run python setup.py bdist_wheel
  pip_install "$(echo dist/*.whl)"

-  # Grab the pinned audio and vision commits from PyTorch
+  # Defer to PyTorch's own pinned audio/vision commits.
  TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
  export TORCHAUDIO_VERSION
  TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt)
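A hedged Python rendering of the wheel branch above, to make the channel logic explicit. pip_install is the repo's shell helper; here it is flattened to a plain pip invocation, and the spec and URL values are invented for illustration:

import subprocess

def install_wheels(channel: str, specs: list[str], index_url_base: str) -> None:
    # Mirrors install_pytorch_and_domains: test/release channels install
    # published wheels; "test" adds --no-cache-dir because RC wheels are
    # re-uploaded under the same version string.
    cmd = ["pip", "install", "--force-reinstall"]
    if channel == "test":
        cmd.append("--no-cache-dir")
    cmd += [*specs, "--index-url", f"{index_url_base}/cpu"]
    subprocess.run(cmd, check=True)

# Illustrative values only; the real ones arrive as docker build args:
install_wheels(
    "test",
    ["torch==2.11.0", "torchvision==0.26.0", "torchaudio==2.11.0"],
    "https://download.pytorch.org/whl/test",
)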

.ci/docker/ubuntu/Dockerfile
Lines changed: 6 additions & 1 deletion

@@ -64,6 +64,11 @@ ENV SCCACHE_S3_KEY_PREFIX executorch
 ENV SCCACHE_REGION us-east-1

 ARG TORCH_VERSION
+ARG TORCH_CHANNEL
+ARG TORCH_SPEC
+ARG TORCHAUDIO_SPEC
+ARG TORCHVISION_SPEC
+ARG TORCH_INDEX_URL
 ARG SKIP_PYTORCH
 COPY ./common/install_pytorch.sh install_pytorch.sh
 COPY ./common/utils.sh utils.sh
@@ -105,7 +110,7 @@ COPY ./common/install_cuda_windows_cross_compile.sh install_cuda_windows_cross_c
 COPY ./common/utils.sh utils.sh
 RUN if [ -n "${CUDA_WINDOWS_CROSS_COMPILE}" ]; then \
     CUDA_VERSION=${CUDA_VERSION} bash ./install_cuda.sh && \
-    bash ./install_cuda_windows_cross_compile.sh; \
+    CUDA_VERSION=${CUDA_VERSION} bash ./install_cuda_windows_cross_compile.sh; \
 fi
 RUN rm -f install_cuda.sh install_cuda_windows_cross_compile.sh utils.sh
 # Set up CUDA environment for Linux compilation (nvcc, etc.)

.ci/scripts/build-qnn-sdk.sh
Lines changed: 2 additions & 0 deletions

@@ -40,6 +40,8 @@ set_up_aot() {
     -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
     -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON \
     -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
     -DPYTHON_EXECUTABLE=python3
   cmake --build $PWD --target "PyQnnManagerAdaptor" -j$(nproc)

.ci/scripts/cuda_benchmark.py
Lines changed: 31 additions & 1 deletion

@@ -18,7 +18,9 @@ class RunMetrics:
     """Metrics from a single run."""

     generated_tokens: int
+    prompt_tokens: int
     tokens_per_sec: float
+    prefill_tokens_per_sec: float
     model_load_time_ms: float
     total_inference_time_ms: float
     encoder_time_ms: float
@@ -28,7 +30,8 @@ class RunMetrics:
     def __repr__(self):
         return (
             f"Tokens: {self.generated_tokens}, "
-            f"Throughput: {self.tokens_per_sec:.2f} t/s, "
+            f"Prefill: {self.prefill_tokens_per_sec:.2f} t/s ({self.prompt_tokens} tokens), "
+            f"Decode: {self.tokens_per_sec:.2f} t/s, "
             f"Model load: {self.model_load_time_ms:.0f}ms, "
             f"Total inference: {self.total_inference_time_ms:.0f}ms, "
             f"Encoder: {self.encoder_time_ms:.0f}ms, "
@@ -49,6 +52,7 @@ def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]:

     # Extract values
     generated_tokens = data.get("generated_tokens", 0)
+    prompt_tokens = data.get("prompt_tokens", 0)
     inference_start_ms = data.get("inference_start_ms", 0)
     inference_end_ms = data.get("inference_end_ms", 0)
     prompt_eval_end_ms = data.get("prompt_eval_end_ms", 0)
@@ -72,12 +76,20 @@ def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]:
         if generation_time_ms > 0
         else 0
     )
+
+    # Calculate prefill throughput
+    prefill_tokens_per_sec = (
+        (prompt_tokens / encoder_time_ms * 1000) if encoder_time_ms > 0 else 0
+    )
+
     model_load_time_ms = model_load_end_ms - model_load_start_ms
     first_token_latency_ms = first_token_ms - prompt_eval_end_ms

     return RunMetrics(
         generated_tokens=generated_tokens,
+        prompt_tokens=prompt_tokens,
         tokens_per_sec=tokens_per_sec,
+        prefill_tokens_per_sec=prefill_tokens_per_sec,
         model_load_time_ms=model_load_time_ms,
         total_inference_time_ms=total_inference_time_ms,
         encoder_time_ms=encoder_time_ms,
@@ -505,6 +517,7 @@ class BenchmarkResults:

     # Metrics
     throughput: MetricStats
+    prefill_throughput: MetricStats
     model_load_time: MetricStats
     total_inference_time: MetricStats
     encoder_time: MetricStats
@@ -529,6 +542,10 @@ def to_dict(self) -> dict:
             "throughput_min": self.throughput.min_val,
             "throughput_max": self.throughput.max_val,
             "throughput_stdev": self.throughput.stdev,
+            "prefill_throughput_mean": self.prefill_throughput.mean,
+            "prefill_throughput_min": self.prefill_throughput.min_val,
+            "prefill_throughput_max": self.prefill_throughput.max_val,
+            "prefill_throughput_stdev": self.prefill_throughput.stdev,
             "model_load_time_mean": self.model_load_time.mean,
             "model_load_time_min": self.model_load_time.min_val,
             "model_load_time_max": self.model_load_time.max_val,
@@ -601,6 +618,13 @@ def to_v3_format(
                 runner_type,
                 base_extra_info,
             ),
+            self.prefill_throughput.create_v3_record(
+                model_name_with_quant,
+                backend,
+                runner_name,
+                runner_type,
+                base_extra_info,
+            ),
             self.model_load_time.create_v3_record(
                 model_name_with_quant,
                 backend,
@@ -696,6 +720,11 @@ def create_metric_stats(
             "t/s",
             {"trimmed_runs": len(trimmed_throughput)},
         ),
+        prefill_throughput=create_metric_stats(
+            "prefill_encoder_throughput(tokens/sec)",
+            [r.prefill_tokens_per_sec for r in results],
+            "t/s",
+        ),
        model_load_time=create_metric_stats(
            "model_load_time(ms)",
            [r.model_load_time_ms for r in results],
@@ -740,6 +769,7 @@ def print_summary(summary: BenchmarkResults) -> None:

     # Print all metrics using their print_stats method
     summary.throughput.print_stats()
+    summary.prefill_throughput.print_stats()
     summary.model_load_time.print_stats()
     summary.total_inference_time.print_stats()
     summary.encoder_time.print_stats()
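The new prefill metric follows directly from the observer timestamps: prompt_tokens divided by encoder time, scaled to seconds, with a zero-guard so a missing encoder timing yields 0 instead of a ZeroDivisionError. A small worked example with invented numbers:

def prefill_tps(prompt_tokens: int, encoder_time_ms: float) -> float:
    # Same formula and guard as parse_pytorch_observer_log above.
    return (prompt_tokens / encoder_time_ms * 1000) if encoder_time_ms > 0 else 0

print(prefill_tps(64, 80.0))  # 800.0 t/s: 64 prompt tokens prefilled in 80 ms
print(prefill_tps(64, 0.0))   # 0: the log carried no encoder timing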
