Skip to content

Commit ac69057

Browse files
committed
Add the EAGLE-3 speculative-decoding runner (CUDA)
A C++ runner that drives the speculator .pte with the shifted (vLLM-EAGLE) scheme: the draft pairs the target hidden state at position t with token t+1, so each round runs one target forward (target_verify) and reseeds the next draft chain from the hidden states verify already produced -- no standalone target decode. Greedy verification keeps output identical to greedy target decoding. target_verify runs on stable input buffers and can be captured as a CUDA graph. It requires the .pte metadata (fails loudly if absent) and enforces the exported prefill range [get_min_prefill_chunk, get_max_prefill_chunk] (no chunking). The prefill bonus token is always emitted; the speculative loop runs only when more tokens are requested, the bonus was not EOS, and a K-token verify window fits within get_max_seq_len (so a one-token or near-context request returns without seeding the draft). The chat template and stop tokens are flags defaulting to Gemma 4 IT (--chat_prefix/--chat_suffix/--stop_ids/--stop_token, --bos_id -1 to skip) so other target/tokenizer pairs run without code changes. Device-to-host reads are error-checked; the printed tau excludes the free prefill token. Authored with assistance from Claude Code. ghstack-source-id: 54fea57 ghstack-comment-id: 4661634339 Pull-Request: #20156
1 parent 56f6cdd commit ac69057

3 files changed

Lines changed: 629 additions & 0 deletions

File tree

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
cmake_minimum_required(VERSION 3.24)
8+
project(eagle3_speculator)
9+
10+
set(CMAKE_CXX_STANDARD 17)
11+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
12+
13+
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
14+
15+
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
16+
17+
set(_common_include_directories ${EXECUTORCH_ROOT}/..)
18+
19+
# gflags
20+
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
21+
find_package(gflags REQUIRED)
22+
23+
# executorch
24+
list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
25+
find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
26+
executorch_target_link_options_shared_lib(executorch)
27+
28+
set(link_libraries executorch gflags)
29+
30+
# CPU ops (host-side helpers not delegated to CUDA)
31+
list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
32+
executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
33+
34+
# Extensions
35+
list(
36+
APPEND
37+
link_libraries
38+
extension_llm_runner
39+
extension_module
40+
extension_data_loader
41+
extension_tensor
42+
extension_flat_tensor
43+
)
44+
45+
# Backend: CUDA (AOTI). The EAGLE-3 speculator export is CUDA-only.
46+
if(EXECUTORCH_BUILD_CUDA)
47+
find_package(CUDAToolkit REQUIRED)
48+
list(APPEND link_libraries aoti_cuda_backend)
49+
executorch_target_link_options_shared_lib(aoti_cuda_backend)
50+
add_compile_definitions(EXECUTORCH_BUILD_CUDA)
51+
else()
52+
message(FATAL_ERROR "EAGLE-3 speculator runner requires EXECUTORCH_BUILD_CUDA=ON")
53+
endif()
54+
55+
# Tokenizer (HuggingFace tokenizer.json)
56+
list(APPEND link_libraries tokenizers::tokenizers)
57+
58+
add_executable(eagle3_speculator_runner main.cpp)
59+
target_include_directories(
60+
eagle3_speculator_runner PUBLIC ${_common_include_directories}
61+
)
62+
target_link_libraries(eagle3_speculator_runner PUBLIC ${link_libraries})
63+
64+
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
65+
target_link_options_gc_sections(eagle3_speculator_runner)
66+
if(NOT APPLE AND NOT MSVC)
67+
target_link_options(eagle3_speculator_runner PRIVATE "LINKER:-s")
68+
endif()
69+
endif()
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"version": 6,
3+
"configurePresets": [
4+
{
5+
"name": "eagle3-cuda",
6+
"displayName": "EAGLE-3 speculator runner (CUDA)",
7+
"binaryDir": "${sourceDir}/../../../cmake-out/examples/models/eagle3",
8+
"cacheVariables": {
9+
"CMAKE_BUILD_TYPE": "Release",
10+
"CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out",
11+
"CMAKE_PREFIX_PATH": "${sourceDir}/../../../cmake-out",
12+
"EXECUTORCH_BUILD_CUDA": "ON"
13+
},
14+
"condition": {
15+
"type": "inList",
16+
"string": "${hostSystemName}",
17+
"list": ["Linux", "Windows"]
18+
}
19+
}
20+
],
21+
"buildPresets": [
22+
{
23+
"name": "eagle3-cuda",
24+
"displayName": "Build EAGLE-3 speculator runner (CUDA)",
25+
"configurePreset": "eagle3-cuda",
26+
"targets": ["eagle3_speculator_runner"]
27+
}
28+
]
29+
}

0 commit comments

Comments
 (0)