Skip to content

Commit 9ced5cc

Browse files
authored
Merge branch 'main' into gasoonjia/flashdecoding-pp-async-softmax
2 parents b62ed91 + c48ea12 commit 9ced5cc

310 files changed

Lines changed: 18704 additions & 3185 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.ci/scripts/cuda_benchmark.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ class RunMetrics:
1818
"""Metrics from a single run."""
1919

2020
generated_tokens: int
21+
prompt_tokens: int
2122
tokens_per_sec: float
23+
prefill_tokens_per_sec: float
2224
model_load_time_ms: float
2325
total_inference_time_ms: float
2426
encoder_time_ms: float
@@ -28,7 +30,8 @@ class RunMetrics:
2830
def __repr__(self):
2931
return (
3032
f"Tokens: {self.generated_tokens}, "
31-
f"Throughput: {self.tokens_per_sec:.2f} t/s, "
33+
f"Prefill: {self.prefill_tokens_per_sec:.2f} t/s ({self.prompt_tokens} tokens), "
34+
f"Decode: {self.tokens_per_sec:.2f} t/s, "
3235
f"Model load: {self.model_load_time_ms:.0f}ms, "
3336
f"Total inference: {self.total_inference_time_ms:.0f}ms, "
3437
f"Encoder: {self.encoder_time_ms:.0f}ms, "
@@ -49,6 +52,7 @@ def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]:
4952

5053
# Extract values
5154
generated_tokens = data.get("generated_tokens", 0)
55+
prompt_tokens = data.get("prompt_tokens", 0)
5256
inference_start_ms = data.get("inference_start_ms", 0)
5357
inference_end_ms = data.get("inference_end_ms", 0)
5458
prompt_eval_end_ms = data.get("prompt_eval_end_ms", 0)
@@ -72,12 +76,20 @@ def parse_pytorch_observer_log(log_line: str) -> Optional[RunMetrics]:
7276
if generation_time_ms > 0
7377
else 0
7478
)
79+
80+
# Calculate prefill throughput
81+
prefill_tokens_per_sec = (
82+
(prompt_tokens / encoder_time_ms * 1000) if encoder_time_ms > 0 else 0
83+
)
84+
7585
model_load_time_ms = model_load_end_ms - model_load_start_ms
7686
first_token_latency_ms = first_token_ms - prompt_eval_end_ms
7787

7888
return RunMetrics(
7989
generated_tokens=generated_tokens,
90+
prompt_tokens=prompt_tokens,
8091
tokens_per_sec=tokens_per_sec,
92+
prefill_tokens_per_sec=prefill_tokens_per_sec,
8193
model_load_time_ms=model_load_time_ms,
8294
total_inference_time_ms=total_inference_time_ms,
8395
encoder_time_ms=encoder_time_ms,
@@ -505,6 +517,7 @@ class BenchmarkResults:
505517

506518
# Metrics
507519
throughput: MetricStats
520+
prefill_throughput: MetricStats
508521
model_load_time: MetricStats
509522
total_inference_time: MetricStats
510523
encoder_time: MetricStats
@@ -529,6 +542,10 @@ def to_dict(self) -> dict:
529542
"throughput_min": self.throughput.min_val,
530543
"throughput_max": self.throughput.max_val,
531544
"throughput_stdev": self.throughput.stdev,
545+
"prefill_throughput_mean": self.prefill_throughput.mean,
546+
"prefill_throughput_min": self.prefill_throughput.min_val,
547+
"prefill_throughput_max": self.prefill_throughput.max_val,
548+
"prefill_throughput_stdev": self.prefill_throughput.stdev,
532549
"model_load_time_mean": self.model_load_time.mean,
533550
"model_load_time_min": self.model_load_time.min_val,
534551
"model_load_time_max": self.model_load_time.max_val,
@@ -601,6 +618,13 @@ def to_v3_format(
601618
runner_type,
602619
base_extra_info,
603620
),
621+
self.prefill_throughput.create_v3_record(
622+
model_name_with_quant,
623+
backend,
624+
runner_name,
625+
runner_type,
626+
base_extra_info,
627+
),
604628
self.model_load_time.create_v3_record(
605629
model_name_with_quant,
606630
backend,
@@ -696,6 +720,11 @@ def create_metric_stats(
696720
"t/s",
697721
{"trimmed_runs": len(trimmed_throughput)},
698722
),
723+
prefill_throughput=create_metric_stats(
724+
"prefill_encoder_throughput(tokens/sec)",
725+
[r.prefill_tokens_per_sec for r in results],
726+
"t/s",
727+
),
699728
model_load_time=create_metric_stats(
700729
"model_load_time(ms)",
701730
[r.model_load_time_ms for r in results],
@@ -740,6 +769,7 @@ def print_summary(summary: BenchmarkResults) -> None:
740769

741770
# Print all metrics using their print_stats method
742771
summary.throughput.print_stats()
772+
summary.prefill_throughput.print_stats()
743773
summary.model_load_time.print_stats()
744774
summary.total_inference_time.print_stats()
745775
summary.encoder_time.print_stats()
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
Please analyze and summarize the following text in detail:
2+
3+
The Transformer architecture, introduced by Vaswani et al. in the 2017 paper "Attention Is All You Need," fundamentally reshaped the landscape of natural language processing and, more broadly, machine learning. Prior to its introduction, sequence modeling had been dominated by recurrent neural networks such as the Long Short-Term Memory (LSTM) and the Gated Recurrent Unit (GRU), as well as by convolutional approaches that attempted to capture local context efficiently. While these architectures achieved respectable results on a variety of tasks, they suffered from inherent limitations: recurrent computations could not be easily parallelized along the time dimension, gradients tended to vanish or explode across very long sequences, and convolutional models had difficulty capturing global dependencies without resorting to deep stacks of layers or specialized dilated kernels.
4+
5+
The Transformer addressed these limitations by replacing recurrence with a mechanism known as self-attention. In self-attention, every token in a sequence computes a weighted sum of representations from every other token, where the weights are determined by a compatibility function between learned queries and keys. This formulation has several appealing properties. First, the computation across positions is fully parallelizable, which dramatically accelerates training on modern accelerator hardware such as graphics processing units and tensor processing units. Second, the path length between any two tokens is constant, allowing the model to capture long-range dependencies far more directly than recurrent networks ever could. Third, the attention weights themselves carry interpretable information about which parts of the input the model is using to produce each output, offering a window into model behavior that earlier architectures rarely provided.
6+
7+
A standard Transformer block stacks multi-head self-attention with a position-wise feed-forward network, surrounded by residual connections and layer normalization. The multi-head design allows the model to attend to information from multiple representation subspaces simultaneously: each head learns its own projection of the input into queries, keys, and values, computes its own attention pattern, and contributes a slice of the final output. Empirically, different heads often specialize in different linguistic phenomena, with some focusing on syntactic relationships such as subject-verb agreement, others on coreference, and still others on broader semantic context.
8+
9+
Because self-attention is permutation invariant, the Transformer must inject information about the order of tokens explicitly. The original paper used fixed sinusoidal positional encodings added to the token embeddings, but subsequent work has explored a wide variety of alternatives. Learned positional embeddings, relative position representations, rotary positional embeddings, and attention-with-linear-biases schemes have all been proposed and adopted in different families of models. Each approach offers a different trade-off between expressiveness, generalization to longer contexts, and computational cost.
10+
11+
The original Transformer was an encoder-decoder model designed for machine translation, but the architecture quickly proved general enough to power a remarkable range of subsequent systems. Encoder-only models such as BERT framed pretraining as a masked language modeling task and dramatically improved performance on classification, question answering, and information retrieval. Decoder-only models such as the GPT family treated language modeling as next-token prediction at very large scale, and demonstrated that with sufficient data and compute, a single architecture could exhibit emergent capabilities such as few-shot learning, code generation, and chain-of-thought reasoning. Encoder-decoder variants such as T5 and BART unified many tasks under a common text-to-text framing, simplifying the engineering of multitask systems.
12+
13+
Scaling laws, established through systematic empirical study, have shown that Transformer performance improves predictably as a function of model parameters, dataset size, and compute budget. This insight motivated a wave of increasingly large models, culminating in dense networks with hundreds of billions of parameters and sparsely activated mixture-of-experts networks with trillions. Mixture-of-experts approaches in particular have become attractive because they decouple total parameter count from the per-token computation cost: a router network selects a small subset of experts for each token, allowing the model to grow in capacity without a proportional increase in inference latency. This trade-off makes mixture-of-experts especially appealing for deployment scenarios where memory bandwidth dominates compute as the bottleneck.
14+
15+
Beyond text, the Transformer has been adapted to images, audio, video, proteins, source code, and combinations thereof. Vision Transformers split images into patches and treat each patch as a token; speech models tokenize raw audio or spectrogram features; multimodal systems share a common Transformer backbone across modalities by aligning their embedding spaces. The flexibility of the architecture, combined with the practical advantages of large-scale pretraining followed by task-specific fine-tuning or in-context prompting, has made the Transformer the de facto foundation of contemporary deep learning research and applications.
16+
17+
Of course, the Transformer is not without its drawbacks. The quadratic memory and compute cost of standard self-attention with respect to sequence length remains a significant practical limitation. A vibrant subfield of research focuses on efficient attention variants, including sparse, low-rank, and kernelized formulations, as well as state-space models that recover linear complexity while preserving competitive accuracy. Long-context inference also stresses the key-value cache, motivating techniques such as paged attention, grouped-query attention, sliding-window attention, and aggressive quantization. Quantization in particular has become a critical tool for deploying large models on commodity hardware, with low-bit integer formats and weight-only quantization schemes enabling inference on consumer GPUs and edge devices that would otherwise be unable to host a model.
18+
19+
Tooling has co-evolved with the architecture. Compilers and runtimes such as ExecuTorch, TensorRT, vLLM, and various ONNX-based stacks specialize in lowering Transformer graphs onto target accelerators while applying optimizations such as kernel fusion, operator scheduling, and memory planning. These systems make it feasible to take a research model trained in a high-level framework and deploy it efficiently on production hardware ranging from data center GPUs to mobile system-on-chips. The end-to-end pipeline of training, fine-tuning, quantization, export, and runtime execution has become a recognizable engineering discipline in its own right.
20+
21+
Training a frontier-scale Transformer is itself a substantial systems undertaking. Modern pretraining runs combine data parallelism, tensor parallelism, pipeline parallelism, and sequence parallelism, often coordinated through libraries such as PyTorch FSDP, Megatron, and DeepSpeed ZeRO. Practitioners must balance compute and memory carefully, choosing micro-batch sizes that maximize accelerator utilization without exceeding device memory, designing checkpointing schemes that survive node failures over runs that can last for months, and overlapping communication with computation to hide network latency. Activation checkpointing trades extra computation for reduced memory pressure, while mixed precision training with bfloat16 or FP8 formats shrinks memory bandwidth requirements and unlocks newer hardware features.
22+
23+
Inference brings its own set of challenges. The autoregressive nature of decoder-only Transformers means each generated token requires a full forward pass, and the dominant cost shifts from raw matrix multiplication during prefill to memory-bandwidth-bound key-value cache reads during decode. Techniques such as speculative decoding, continuous batching, and prefix caching attempt to claw back utilization. For latency-sensitive deployments, careful kernel fusion, paged attention, and ahead-of-time compilation can reduce per-token overhead substantially, and the rise of small distilled or sparsely activated models offers an alternative path to acceptable quality at a fraction of the cost.
24+
25+
Looking ahead, the Transformer's dominance is being challenged by alternative architectures such as state-space models, linear recurrent networks, and hybrid designs that interleave attention with other mixing primitives. Whether any of these will displace the Transformer entirely remains to be seen, but it is already clear that the ideas the Transformer popularized — content-based mixing of tokens, parallelizable training, and large-scale pretraining followed by adaptation — will continue to shape the field for years to come.

.ci/scripts/test_cortex_m_e2e.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#!/usr/bin/env bash
22
# Copyright (c) Meta Platforms, Inc. and affiliates.
3+
# Copyright 2026 Arm Limited and/or its affiliates.
34
# All rights reserved.
45
#
56
# This source code is licensed under the BSD-style license found in the
@@ -18,7 +19,7 @@ mkdir -p "./cortex_m_e2e/${MODEL}"
1819
WORK_DIR=$(realpath "./cortex_m_e2e/${MODEL}")
1920

2021
echo "=== Exporting ${MODEL} with cortex-m55+int8 ==="
21-
python -m examples.arm.aot_arm_compiler \
22+
python -m backends.arm.scripts.aot_arm_compiler \
2223
-m "${MODEL}" \
2324
--target=cortex-m55+int8 \
2425
--quantize \
Lines changed: 29 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
# LICENSE file in the root directory of this source tree.
77

88
set -ex
9+
910
# shellcheck source=/dev/null
1011
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
1112

@@ -50,21 +51,21 @@ PT2E_QUANTIZE="${PT2E_QUANTIZE:-}"
5051
# Default CMake Build Type to release mode
5152
CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
5253

53-
if [[ $# -lt 5 ]]; then # Assuming 4 mandatory args
54-
echo "Expecting atleast 5 positional arguments"
55-
echo "Usage: [...]"
56-
fi
5754
if [[ -z "${MODEL_NAME:-}" ]]; then
5855
echo "Missing model name, exiting..."
5956
exit 1
6057
fi
6158

62-
6359
if [[ -z "${MODE:-}" ]]; then
6460
echo "Missing mode, choose openvino or xnnpack, exiting..."
6561
exit 1
6662
fi
6763

64+
if [[ -z "${VIDEO_PATH:-}" ]]; then
65+
echo "Missing video path, exiting..."
66+
exit 1
67+
fi
68+
6869
if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
6970
PYTHON_EXECUTABLE=python3
7071
fi
@@ -75,21 +76,13 @@ if [[ "${MODE}" =~ .*openvino.* ]]; then
7576
OPENVINO=ON
7677
TARGET_LIBS="$TARGET_LIBS openvino_backend "
7778

78-
git clone https://github.com/openvinotoolkit/openvino.git
79-
cd openvino && git b16b776ac119dafda51f69a80f1e6b7376d02c3b
80-
git submodule update --init --recursive
81-
sudo ./install_build_dependencies.sh
82-
mkdir build && cd build
83-
cmake .. -DCMAKE_BUILD_TYPE=Release -DENABLE_PYTHON=ON
84-
make -j$(nproc)
85-
86-
cd ..
87-
cmake --install build --prefix dist
88-
89-
source dist/setupvars.sh
90-
cd ../backends/openvino
91-
pip install -r requirements.txt
92-
cd ../../
79+
# Install specific OpenVINO runtime from pip.
80+
$PYTHON_EXECUTABLE -m pip install --pre openvino==2026.1.0.dev20260131 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
81+
$PYTHON_EXECUTABLE -m pip install -r backends/openvino/requirements.txt
82+
83+
# Set OPENVINO_LIB_PATH so the C++ demo runner can also find libopenvino_c.so.
84+
OPENVINO_LIB_PATH=$($PYTHON_EXECUTABLE -c "import openvino, os, glob; print(sorted(glob.glob(os.path.join(os.path.dirname(openvino.__file__), 'libs', 'libopenvino_c.so*')))[-1])")
85+
export OPENVINO_LIB_PATH
9386
else
9487
OPENVINO=OFF
9588
fi
@@ -103,9 +96,10 @@ fi
10396

10497
which "${PYTHON_EXECUTABLE}"
10598

99+
TORCH_URL=https://download.pytorch.org/whl/cpu
106100

107-
DIR="examples/models/yolo12"
108-
$PYTHON_EXECUTABLE -m pip install -r ${DIR}/requirements.txt
101+
DIR="examples/models/yolo26"
102+
$PYTHON_EXECUTABLE -m pip install --upgrade-strategy only-if-needed --extra-index-url "$TORCH_URL" -r ${DIR}/requirements.txt
109103

110104
cmake_install_executorch_libraries() {
111105
rm -rf cmake-out
@@ -142,11 +136,11 @@ cmake_install_executorch_libraries() {
142136

143137
echo $TARGET_LIBS
144138
export CMAKE_BUILD_ARGS="--target $TARGET_LIBS"
145-
pip install . --no-build-isolation
139+
$PYTHON_EXECUTABLE -m pip install . --no-build-isolation
146140
}
147141

148142
cmake_build_demo() {
149-
echo "Building yolo12 runner"
143+
echo "Building yolo26 runner"
150144
retry cmake \
151145
-DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
152146
-DUSE_OPENVINO_BACKEND="$OPENVINO" \
@@ -174,24 +168,29 @@ prepare_artifacts_upload() {
174168

175169

176170
# Export model.
177-
EXPORTED_MODEL_NAME="${MODEL_NAME}_fp32_${MODE}.pte"
178-
echo "Exporting ${EXPORTED_MODEL_NAME}"
179171
EXPORT_ARGS="--model_name=${MODEL_NAME} --backend=${MODE}"
172+
if [[ -n "${PT2E_QUANTIZE}" ]]; then
173+
EXPORTED_MODEL_NAME="${MODEL_NAME}_int8_${MODE}.pte"
174+
EXPORT_ARGS="${EXPORT_ARGS} --quantize --video_path=${VIDEO_PATH}"
175+
else
176+
EXPORTED_MODEL_NAME="${MODEL_NAME}_fp32_${MODE}.pte"
177+
fi
178+
echo "Exporting ${EXPORTED_MODEL_NAME}"
180179

181180
# Add dynamically linked library location
182181
cmake_install_executorch_libraries
183182

184-
$PYTHON_EXECUTABLE -m examples.models.yolo12.export_and_validate ${EXPORT_ARGS}
183+
$PYTHON_EXECUTABLE -m examples.models.yolo26.export_and_validate ${EXPORT_ARGS}
185184

186185

187186
RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --input_path=${VIDEO_PATH}"
188187
# Check build tool.
189188
cmake_build_demo
190-
# Run yolo12 runner
189+
# Run yolo26 runner
191190
NOW=$(date +"%H:%M:%S")
192-
echo "Starting to run yolo12 runner at ${NOW}"
191+
echo "Starting to run yolo26 runner at ${NOW}"
193192
# shellcheck source=/dev/null
194-
cmake-out/examples/models/yolo12/Yolo12DetectionDemo ${RUNTIME_ARGS} > result.txt
193+
cmake-out/examples/models/yolo26/Yolo26DetectionDemo ${RUNTIME_ARGS} > result.txt
195194
NOW=$(date +"%H:%M:%S")
196195
echo "Finished at ${NOW}"
197196

.claude/settings.json

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{
2+
"hooks": {
3+
"PreToolUse": [
4+
{
5+
"matcher": "Bash",
6+
"hooks": [
7+
{
8+
"type": "command",
9+
"command": "if [ -x .wiki/fb/hooks/resync-guard.sh ]; then bash .wiki/fb/hooks/resync-guard.sh; fi"
10+
}
11+
]
12+
}
13+
]
14+
}
15+
}

0 commit comments

Comments
 (0)