Skip to content

Commit 855f75e

Browse files
committed
Update on "[ET Device Support] Annotate device attributes of CUDA backend IO tensors as cuda device"
Update the CUDA backend partitioner to annotate its IO tensors as cuda device. Differential Revision: [D96010436](https://our.internmc.facebook.com/intern/diff/D96010436/) [ghstack-poisoned]
2 parents f5f20d9 + 7008e9b commit 855f75e

873 files changed

Lines changed: 68294 additions & 8842 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to locate content that may be hidden.

.ci/docker/build.sh

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/bin/bash
22
# Copyright (c) Meta Platforms, Inc. and affiliates.
33
# All rights reserved.
4+
# Copyright 2026 Arm Limited and/or its affiliates.
45
#
56
# This source code is licensed under the BSD-style license found in the
67
# LICENSE file in the root directory of this source tree.
@@ -94,11 +95,6 @@ BUILD_DOCS=1
9495
# Copy requirements-lintrunner.txt from root to here
9596
cp ../../requirements-lintrunner.txt ./
9697

97-
# Copy arm setup script from root to here
98-
# TODO(huydhn): Figure out a way to rebuild the Docker image automatically
99-
# with a new image hash when the content here is updated
100-
cp -r ../../examples/arm/ ./arm
101-
10298
docker build \
10399
--no-cache \
104100
--progress=plain \

.ci/docker/common/install_cuda.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,15 @@ apt-get update
3838
# - libcublas-dev: cuBLAS development files
3939
# - libcusparse-dev: cuSPARSE development files
4040
# - libcufft-dev: cuFFT development files
41+
# - libcurand-dev: cuRAND development files
4142
apt-get install -y --no-install-recommends \
4243
"cuda-nvcc-${CUDA_VERSION_DASH}" \
4344
"cuda-cudart-dev-${CUDA_VERSION_DASH}" \
4445
"cuda-nvrtc-dev-${CUDA_VERSION_DASH}" \
4546
"libcublas-dev-${CUDA_VERSION_DASH}" \
4647
"libcusparse-dev-${CUDA_VERSION_DASH}" \
47-
"libcufft-dev-${CUDA_VERSION_DASH}"
48+
"libcufft-dev-${CUDA_VERSION_DASH}" \
49+
"libcurand-dev-${CUDA_VERSION_DASH}"
4850

4951
# Clean up
5052
apt-get clean

.ci/scripts/cuda_benchmark.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ class RunMetrics:
1818
"""Metrics from a single run."""
1919

2020
generated_tokens: int
21+
prompt_tokens: int
2122
tokens_per_sec: float
23+
prefill_tokens_per_sec: float
2224
model_load_time_ms: float
2325
total_inference_time_ms: float
2426
encoder_time_ms: float
@@ -28,7 +30,8 @@ class RunMetrics:
2830
def __repr__(self):
2931
return (
3032
f"Tokens: {self.generated_tokens}, "
31-
f"Throughput: {self.tokens_per_sec:.2f} t/s, "
33+
f"Prefill: {self.prefill_tokens_per_sec:.2f} t/s ({self.prompt_tokens} tokens), "
34+
f"Decode: {self.tokens_per_sec:.2f} t/s, "
3235
f"Model load: {self.model_load_time_ms:.0f}ms, "
3336
f"Total inference: {self.total_inference_time_ms:.0f}ms, "
3437
f"Encoder: {self.encoder_time_ms:.0f}ms, "
@@ -49,6 +52,7 @@ def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]:
4952

5053
# Extract values
5154
generated_tokens = data.get("generated_tokens", 0)
55+
prompt_tokens = data.get("prompt_tokens", 0)
5256
inference_start_ms = data.get("inference_start_ms", 0)
5357
inference_end_ms = data.get("inference_end_ms", 0)
5458
prompt_eval_end_ms = data.get("prompt_eval_end_ms", 0)
@@ -72,12 +76,20 @@ def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]:
7276
if generation_time_ms > 0
7377
else 0
7478
)
79+
80+
# Calculate prefill throughput
81+
prefill_tokens_per_sec = (
82+
(prompt_tokens / encoder_time_ms * 1000) if encoder_time_ms > 0 else 0
83+
)
84+
7585
model_load_time_ms = model_load_end_ms - model_load_start_ms
7686
first_token_latency_ms = first_token_ms - prompt_eval_end_ms
7787

7888
return RunMetrics(
7989
generated_tokens=generated_tokens,
90+
prompt_tokens=prompt_tokens,
8091
tokens_per_sec=tokens_per_sec,
92+
prefill_tokens_per_sec=prefill_tokens_per_sec,
8193
model_load_time_ms=model_load_time_ms,
8294
total_inference_time_ms=total_inference_time_ms,
8395
encoder_time_ms=encoder_time_ms,
@@ -505,6 +517,7 @@ class BenchmarkResults:
505517

506518
# Metrics
507519
throughput: MetricStats
520+
prefill_throughput: MetricStats
508521
model_load_time: MetricStats
509522
total_inference_time: MetricStats
510523
encoder_time: MetricStats
@@ -529,6 +542,10 @@ def to_dict(self) -> dict:
529542
"throughput_min": self.throughput.min_val,
530543
"throughput_max": self.throughput.max_val,
531544
"throughput_stdev": self.throughput.stdev,
545+
"prefill_throughput_mean": self.prefill_throughput.mean,
546+
"prefill_throughput_min": self.prefill_throughput.min_val,
547+
"prefill_throughput_max": self.prefill_throughput.max_val,
548+
"prefill_throughput_stdev": self.prefill_throughput.stdev,
532549
"model_load_time_mean": self.model_load_time.mean,
533550
"model_load_time_min": self.model_load_time.min_val,
534551
"model_load_time_max": self.model_load_time.max_val,
@@ -601,6 +618,13 @@ def to_v3_format(
601618
runner_type,
602619
base_extra_info,
603620
),
621+
self.prefill_throughput.create_v3_record(
622+
model_name_with_quant,
623+
backend,
624+
runner_name,
625+
runner_type,
626+
base_extra_info,
627+
),
604628
self.model_load_time.create_v3_record(
605629
model_name_with_quant,
606630
backend,
@@ -696,6 +720,11 @@ def create_metric_stats(
696720
"t/s",
697721
{"trimmed_runs": len(trimmed_throughput)},
698722
),
723+
prefill_throughput=create_metric_stats(
724+
"prefill_encoder_throughput(tokens/sec)",
725+
[r.prefill_tokens_per_sec for r in results],
726+
"t/s",
727+
),
699728
model_load_time=create_metric_stats(
700729
"model_load_time(ms)",
701730
[r.model_load_time_ms for r in results],
@@ -740,6 +769,7 @@ def print_summary(summary: BenchmarkResults) -> None:
740769

741770
# Print all metrics using their print_stats method
742771
summary.throughput.print_stats()
772+
summary.prefill_throughput.print_stats()
743773
summary.model_load_time.print_stats()
744774
summary.total_inference_time.print_stats()
745775
summary.encoder_time.print_stats()
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
Please analyze and summarize the following text in detail:
2+
3+
The Transformer architecture, introduced by Vaswani et al. in the 2017 paper "Attention Is All You Need," fundamentally reshaped the landscape of natural language processing and, more broadly, machine learning. Prior to its introduction, sequence modeling had been dominated by recurrent neural networks such as the Long Short-Term Memory (LSTM) and the Gated Recurrent Unit (GRU), as well as by convolutional approaches that attempted to capture local context efficiently. While these architectures achieved respectable results on a variety of tasks, they suffered from inherent limitations: recurrent computations could not be easily parallelized along the time dimension, gradients tended to vanish or explode across very long sequences, and convolutional models had difficulty capturing global dependencies without resorting to deep stacks of layers or specialized dilated kernels.
4+
5+
The Transformer addressed these limitations by replacing recurrence with a mechanism known as self-attention. In self-attention, every token in a sequence computes a weighted sum of representations from every other token, where the weights are determined by a compatibility function between learned queries and keys. This formulation has several appealing properties. First, the computation across positions is fully parallelizable, which dramatically accelerates training on modern accelerator hardware such as graphics processing units and tensor processing units. Second, the path length between any two tokens is constant, allowing the model to capture long-range dependencies far more directly than recurrent networks ever could. Third, the attention weights themselves carry interpretable information about which parts of the input the model is using to produce each output, offering a window into model behavior that earlier architectures rarely provided.
6+
7+
A standard Transformer block stacks multi-head self-attention with a position-wise feed-forward network, surrounded by residual connections and layer normalization. The multi-head design allows the model to attend to information from multiple representation subspaces simultaneously: each head learns its own projection of the input into queries, keys, and values, computes its own attention pattern, and contributes a slice of the final output. Empirically, different heads often specialize in different linguistic phenomena, with some focusing on syntactic relationships such as subject-verb agreement, others on coreference, and still others on broader semantic context.
8+
9+
Because self-attention is permutation invariant, the Transformer must inject information about the order of tokens explicitly. The original paper used fixed sinusoidal positional encodings added to the token embeddings, but subsequent work has explored a wide variety of alternatives. Learned positional embeddings, relative position representations, rotary positional embeddings, and attention-with-linear-biases schemes have all been proposed and adopted in different families of models. Each approach offers a different trade-off between expressiveness, generalization to longer contexts, and computational cost.
10+
11+
The original Transformer was an encoder-decoder model designed for machine translation, but the architecture quickly proved general enough to power a remarkable range of subsequent systems. Encoder-only models such as BERT framed pretraining as a masked language modeling task and dramatically improved performance on classification, question answering, and information retrieval. Decoder-only models such as the GPT family treated language modeling as next-token prediction at very large scale, and demonstrated that with sufficient data and compute, a single architecture could exhibit emergent capabilities such as few-shot learning, code generation, and chain-of-thought reasoning. Encoder-decoder variants such as T5 and BART unified many tasks under a common text-to-text framing, simplifying the engineering of multitask systems.
12+
13+
Scaling laws, established through systematic empirical study, have shown that Transformer performance improves predictably as a function of model parameters, dataset size, and compute budget. This insight motivated a wave of increasingly large models, culminating in dense networks with hundreds of billions of parameters and sparsely activated mixture-of-experts networks with trillions. Mixture-of-experts approaches in particular have become attractive because they decouple total parameter count from the per-token computation cost: a router network selects a small subset of experts for each token, allowing the model to grow in capacity without a proportional increase in inference latency. This trade-off makes mixture-of-experts especially appealing for deployment scenarios where memory bandwidth dominates compute as the bottleneck.
14+
15+
Beyond text, the Transformer has been adapted to images, audio, video, proteins, source code, and combinations thereof. Vision Transformers split images into patches and treat each patch as a token; speech models tokenize raw audio or spectrogram features; multimodal systems share a common Transformer backbone across modalities by aligning their embedding spaces. The flexibility of the architecture, combined with the practical advantages of large-scale pretraining followed by task-specific fine-tuning or in-context prompting, has made the Transformer the de facto foundation of contemporary deep learning research and applications.
16+
17+
Of course, the Transformer is not without its drawbacks. The quadratic memory and compute cost of standard self-attention with respect to sequence length remains a significant practical limitation. A vibrant subfield of research focuses on efficient attention variants, including sparse, low-rank, and kernelized formulations, as well as state-space models that recover linear complexity while preserving competitive accuracy. Long-context inference also stresses the key-value cache, motivating techniques such as paged attention, grouped-query attention, sliding-window attention, and aggressive quantization. Quantization in particular has become a critical tool for deploying large models on commodity hardware, with low-bit integer formats and weight-only quantization schemes enabling inference on consumer GPUs and edge devices that would otherwise be unable to host a model.
18+
19+
Tooling has co-evolved with the architecture. Compilers and runtimes such as ExecuTorch, TensorRT, vLLM, and various ONNX-based stacks specialize in lowering Transformer graphs onto target accelerators while applying optimizations such as kernel fusion, operator scheduling, and memory planning. These systems make it feasible to take a research model trained in a high-level framework and deploy it efficiently on production hardware ranging from data center GPUs to mobile system-on-chips. The end-to-end pipeline of training, fine-tuning, quantization, export, and runtime execution has become a recognizable engineering discipline in its own right.
20+
21+
Training a frontier-scale Transformer is itself a substantial systems undertaking. Modern pretraining runs combine data parallelism, tensor parallelism, pipeline parallelism, and sequence parallelism, often coordinated through libraries such as PyTorch FSDP, Megatron, and DeepSpeed ZeRO. Practitioners must balance compute and memory carefully, choosing micro-batch sizes that maximize accelerator utilization without exceeding device memory, designing checkpointing schemes that survive node failures over runs that can last for months, and overlapping communication with computation to hide network latency. Activation checkpointing trades extra computation for reduced memory pressure, while mixed precision training with bfloat16 or FP8 formats shrinks memory bandwidth requirements and unlocks newer hardware features.
22+
23+
Inference brings its own set of challenges. The autoregressive nature of decoder-only Transformers means each generated token requires a full forward pass, and the dominant cost shifts from raw matrix multiplication during prefill to memory-bandwidth-bound key-value cache reads during decode. Techniques such as speculative decoding, continuous batching, and prefix caching attempt to claw back utilization. For latency-sensitive deployments, careful kernel fusion, paged attention, and ahead-of-time compilation can reduce per-token overhead substantially, and the rise of small distilled or sparsely activated models offers an alternative path to acceptable quality at a fraction of the cost.
24+
25+
Looking ahead, the Transformer's dominance is being challenged by alternative architectures such as state-space models, linear recurrent networks, and hybrid designs that interleave attention with other mixing primitives. Whether any of these will displace the Transformer entirely remains to be seen, but it is already clear that the ideas the Transformer popularized — content-based mixing of tokens, parallelizable training, and large-scale pretraining followed by adaptation — will continue to shape the field for years to come.

.ci/scripts/download_hf_hub.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
#!/bin/bash
22

3+
# Disable HF Xet storage to avoid stalled downloads on CI runners
4+
export HF_HUB_DISABLE_XET=1
5+
36
# Function to download files from the Hugging Face Hub
47
# Arguments:
58
# 1. model_id: The Hugging Face repository ID (e.g., "organization/model_name")

.ci/scripts/export_model_artifact.sh

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@ if [ -z "${1:-}" ]; then
6767
exit 1
6868
fi
6969

70+
# Disable HF Xet storage to avoid stalled downloads on CI runners
71+
export HF_HUB_DISABLE_XET=1
72+
7073
set -eux
7174

7275
DEVICE="$1"
@@ -415,12 +418,40 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
415418

416419
# Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
417420
echo "::group::Export"
421+
EXPORT_LOG=$(mktemp)
418422
TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
419423
python -m executorch.examples.models.qwen3_5_moe.export \
420424
--prequantized "$LOCAL_MODEL_DIR" \
421-
--output-dir "${OUTPUT_DIR}"
425+
--output-dir "${OUTPUT_DIR}" \
426+
--dense-prefill dequant \
427+
--moe-activation-dtype int8 2>&1 | tee "$EXPORT_LOG"
428+
EXPORT_RC=${PIPESTATUS[0]}
422429
echo "::endgroup::"
423430

431+
if [ "$EXPORT_RC" -ne 0 ]; then
432+
echo "ERROR: Qwen3.5 MoE export failed (exit $EXPORT_RC)"
433+
rm -f "$EXPORT_LOG"
434+
exit "$EXPORT_RC"
435+
fi
436+
437+
# Gate peak GPU memory so we keep the export viable on consumer GPUs
438+
# (e.g. RTX 4090 with 24 GB). The export script prints a machine-
439+
# parseable marker line "EXPORT_GPU_PEAK_MEMORY_MB: <float>".
440+
EXPORT_GPU_PEAK_MB_LIMIT="${EXPORT_GPU_PEAK_MB_LIMIT:-20480}"
441+
PEAK_LINE=$(grep -E '^EXPORT_GPU_PEAK_MEMORY_MB:' "$EXPORT_LOG" | tail -1)
442+
rm -f "$EXPORT_LOG"
443+
if [ -z "$PEAK_LINE" ]; then
444+
echo "ERROR: export did not emit EXPORT_GPU_PEAK_MEMORY_MB marker; cannot enforce GPU memory budget"
445+
exit 1
446+
fi
447+
PEAK_MB=$(echo "$PEAK_LINE" | awk '{print $2}')
448+
echo "Export GPU peak memory: ${PEAK_MB} MB (limit ${EXPORT_GPU_PEAK_MB_LIMIT} MB)"
449+
if awk -v p="$PEAK_MB" -v l="$EXPORT_GPU_PEAK_MB_LIMIT" 'BEGIN{exit !(p>l)}'; then
450+
echo "ERROR: export exceeded GPU memory budget (${PEAK_MB} MB > ${EXPORT_GPU_PEAK_MB_LIMIT} MB)"
451+
echo " — this would prevent the model from being exported on a 24 GB consumer GPU."
452+
exit 1
453+
fi
454+
424455
test -f "${OUTPUT_DIR}/model.pte"
425456
test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
426457
ls -al "${OUTPUT_DIR}"

.ci/scripts/test_backend.sh

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ export PYTHON_EXECUTABLE=python
3535

3636
# CMake options to use, in addition to the defaults.
3737
EXTRA_BUILD_ARGS=""
38+
PYTEST_RETRY_ARGS=()
3839

3940
if [[ "$FLOW" == *qnn* ]]; then
4041
# Setup QNN sdk and deps - note that this is a bit hacky due to the nature of the
@@ -57,6 +58,9 @@ if [[ "$FLOW" == *vulkan* ]]; then
5758
fi
5859

5960
if [[ "$FLOW" == *arm* ]]; then
61+
if [[ "$SUITE" == "operators" ]]; then
62+
PYTEST_RETRY_ARGS=(--reruns 2 --reruns-delay 1)
63+
fi
6064

6165
# Setup ARM deps.
6266
if [[ "$FLOW" == *vgf* ]]; then
@@ -95,6 +99,11 @@ GOLDEN_DIR="${ARTIFACT_DIR}/golden-artifacts"
9599
export GOLDEN_ARTIFACTS_DIR="${GOLDEN_DIR}"
96100

97101
EXIT_CODE=0
98-
${CONDA_RUN_CMD} pytest -c /dev/null -n auto backends/test/suite/$SUITE/ -m flow_$FLOW --json-report --json-report-file="$REPORT_FILE" || EXIT_CODE=$?
102+
PYTEST_ARGS=(-c /dev/null -n auto)
103+
if [[ ${#PYTEST_RETRY_ARGS[@]} -gt 0 ]]; then
104+
PYTEST_ARGS+=("${PYTEST_RETRY_ARGS[@]}")
105+
fi
106+
PYTEST_ARGS+=("backends/test/suite/$SUITE/" -m "flow_$FLOW" --json-report --json-report-file="$REPORT_FILE")
107+
${CONDA_RUN_CMD} pytest "${PYTEST_ARGS[@]}" || EXIT_CODE=$?
99108
# Generate markdown summary.
100109
${CONDA_RUN_CMD} python -m executorch.backends.test.suite.generate_markdown_summary_json "$REPORT_FILE" > ${GITHUB_STEP_SUMMARY:-"step_summary.md"} --exit-code $EXIT_CODE

0 commit comments

Comments
 (0)