Skip to content

Commit 6271ade

Browse files
committed
feat(server): add native mixed-backend draft placement
1 parent 0c103a3 commit 6271ade

9 files changed

Lines changed: 341 additions & 70 deletions

File tree

dflash/CMakeLists.txt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -671,6 +671,27 @@ if(DFLASH27B_TESTS)
671671
endif()
672672
endif()
673673

674+
# Standalone draft IPC daemon. The target is only defined when its entry-point
# source is present in the tree.
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/ipc/dflash_draft_ipc_main.cpp")
    add_executable(dflash_draft_ipc_daemon
        src/ipc/dflash_draft_ipc_main.cpp
    )
    target_include_directories(dflash_draft_ipc_daemon PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})

    # Backend-independent libraries are linked first so the backend runtime
    # below keeps its original position at the end of the link line.
    target_link_libraries(dflash_draft_ipc_daemon PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET} pthread)

    # Compile definitions and runtime library are chosen by the SAME branch.
    # Previously the definitions tested for "hip" (else CUDA) while the link
    # step tested for "cuda" (else HIP), so any other backend value produced a
    # CUDA-flavoured build linked against hip::host.
    if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
        target_compile_definitions(dflash_draft_ipc_daemon PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP)
        target_link_libraries(dflash_draft_ipc_daemon PRIVATE hip::host)
    else()
        find_package(CUDAToolkit REQUIRED)
        target_compile_definitions(dflash_draft_ipc_daemon PRIVATE
            DFLASH27B_BACKEND_CUDA=1
            DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm})
        target_link_libraries(dflash_draft_ipc_daemon PRIVATE CUDA::cudart)
    endif()
endif()
694+
674695
# Tokenizer test harness (no GPU needed — links static lib for tokenizer + GGUF reader)
675696
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_tokenizer_harness.cpp")
676697
add_executable(test_tokenizer_harness test/test_tokenizer_harness.cpp)

dflash/src/common/backend_factory.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ std::string detect_arch(const char * model_path) {
1717
return info.arch;
1818
}
1919

20+
// Reports whether `arch` names a model architecture that is allowed to use
// remote draft execution. Only the exact name "qwen35" qualifies.
bool arch_supports_remote_draft(const std::string & arch) {
    static const char * const kRemoteDraftArch = "qwen35";
    return arch.compare(kRemoteDraftArch) == 0;
}
23+
2024
std::unique_ptr<ModelBackend> create_backend(const BackendArgs & args) {
2125
if (!args.model_path) {
2226
std::fprintf(stderr, "[backend_factory] model_path is null\n");
@@ -38,6 +42,7 @@ std::unique_ptr<ModelBackend> create_backend(const BackendArgs & args) {
3842
cfg.draft_path = args.draft_path;
3943
cfg.device = args.device;
4044
cfg.draft_gpu = args.draft_device.gpu;
45+
cfg.remote_draft = args.remote_draft;
4146
cfg.stream_fd = args.stream_fd;
4247
cfg.fa_window = args.fa_window;
4348
cfg.kq_stride_pad = args.kq_stride_pad;

dflash/src/common/backend_factory.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
#include "model_backend.h"
1414
#include "placement/placement_config.h"
15+
#include "placement/remote_draft_config.h"
1516

1617
#include <memory>
1718
#include <string>
@@ -31,6 +32,7 @@ struct BackendArgs {
3132
// Device placement
3233
DevicePlacement device;
3334
DevicePlacement draft_device;
35+
RemoteDraftConfig remote_draft;
3436

3537
// I/O — only used when running under daemon_loop (legacy). The new
3638
// server passes -1 and uses on_token callbacks instead.
@@ -62,4 +64,6 @@ std::unique_ptr<ModelBackend> create_backend(const BackendArgs & args);
6264
// Useful for early dispatch (e.g. printing which backend will be used).
6365
std::string detect_arch(const char * model_path);
6466

67+
bool arch_supports_remote_draft(const std::string & arch);
68+
6569
} // namespace dflash::common

dflash/src/common/model_backend.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,12 @@ struct ModelBackend {
183183
// growth over time. Default is a no-op.
184184
virtual void release_scratch() {}
185185

186+
// Return true when the backend can route draft execution through the
187+
// common remote-draft IPC transport. Model families that do not implement
188+
// the DFlash feature boundary keep the default false and are rejected by
189+
// the server before startup.
190+
virtual bool supports_remote_draft() const { return false; }
191+
186192
// ── Cleanup ──────────────────────────────────────────────────────
187193
// Release all resources (weights, cache, snapshots, drafter).
188194
// Called by run_daemon() before returning.
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
// Standalone DFlash draft IPC daemon entry point.
2+
3+
#include "dflash_draft_ipc.h"
4+
5+
#include <algorithm>
6+
#include <cstdio>
7+
#include <cstdlib>
8+
#include <cstring>
9+
10+
using namespace dflash::common;
11+
12+
int main(int argc, char ** argv) {
13+
if (argc < 3 || std::strcmp(argv[1], "--draft-ipc-daemon") != 0) {
14+
std::fprintf(stderr,
15+
"usage: %s --draft-ipc-daemon <draft.safetensors|draft.gguf> "
16+
"--ring-cap=N --stream-fd=FD [--draft-gpu=N]\n",
17+
argv[0]);
18+
return 2;
19+
}
20+
21+
const char * draft_path = argv[2];
22+
int ring_cap = 4096;
23+
int draft_gpu = 0;
24+
int stream_fd = -1;
25+
for (int i = 3; i < argc; i++) {
26+
if (std::strncmp(argv[i], "--ring-cap=", 11) == 0) {
27+
ring_cap = std::atoi(argv[i] + 11);
28+
} else if (std::strcmp(argv[i], "--ring-cap") == 0) {
29+
if (i + 1 < argc) ring_cap = std::atoi(argv[++i]);
30+
} else if (std::strncmp(argv[i], "--draft-gpu=", 12) == 0) {
31+
draft_gpu = std::max(0, std::atoi(argv[i] + 12));
32+
} else if (std::strcmp(argv[i], "--draft-gpu") == 0) {
33+
if (i + 1 < argc) draft_gpu = std::max(0, std::atoi(argv[++i]));
34+
} else if (std::strncmp(argv[i], "--stream-fd=", 12) == 0) {
35+
stream_fd = std::atoi(argv[i] + 12);
36+
} else if (std::strcmp(argv[i], "--stream-fd") == 0) {
37+
if (i + 1 < argc) stream_fd = std::atoi(argv[++i]);
38+
} else {
39+
std::fprintf(stderr, "[draft-ipc-daemon] unknown option: %s\n", argv[i]);
40+
return 2;
41+
}
42+
}
43+
44+
return run_dflash_draft_ipc_daemon(draft_path, ring_cap, draft_gpu, stream_fd);
45+
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// Remote draft execution configuration for mixed-backend target/draft placement.
2+
3+
#pragma once
4+
5+
#include <string>
6+
7+
namespace dflash::common {
8+
9+
// Settings for running the draft model remotely, as part of mixed-backend
// target/draft placement.
struct RemoteDraftConfig {
    // Remote drafting counts as enabled exactly when this is non-empty.
    // NOTE(review): presumably the path of the draft IPC daemon binary — confirm.
    std::string ipc_bin;
    // NOTE(review): presumably a working directory for the remote draft
    // process — confirm against the code that consumes it.
    std::string work_dir;
    // Values <= 0 are treated as "not set" by has_aux_options().
    int ring_cap = 0;

    // True when an IPC binary has been configured.
    bool enabled() const {
        return ipc_bin.size() != 0;
    }

    // True when any secondary option (work_dir or a positive ring_cap) is set,
    // regardless of whether remote drafting itself is enabled.
    bool has_aux_options() const {
        if (ring_cap > 0) {
            return true;
        }
        return work_dir.size() != 0;
    }
};
17+
18+
} // namespace dflash::common

0 commit comments

Comments
 (0)