Commit 5192020

Merge pull request #513 from janhq/update-dev-from-master-2026-05-12-01-03
Sync master with upstream release b9113
2 parents d72653f + 1ec7ba0 commit 5192020

43 files changed

Lines changed: 5464 additions & 3984 deletions

.github/workflows/build-virtgpu.yml

Lines changed: 50 additions & 0 deletions

@@ -0,0 +1,50 @@
+name: CI (virtgpu)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-virtgpu.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-virtgpu.yml',
+      'ggml/src/ggml-virtgpu/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  ubuntu-24-virtgpu:
+    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential libdrm-dev pkg-config libssl-dev
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DGGML_VIRTGPU=ON \
+            -DGGML_VIRTGPU_BACKEND=ON
+          cmake --build build --config Release -j $(nproc)
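Two expressions in this workflow lean on GitHub Actions' truthiness rules. `github.head_ref && github.ref || github.run_id` emulates a ternary: pull-request runs (where `head_ref` is non-empty) share a concurrency group keyed on the ref, so superseded runs are cancelled, while pushes to master fall through to the unique `run_id` and never cancel each other. Note that `'ubuntu-24.04-arm' || 'ubuntu-24.04'` always evaluates to the first label, since any non-empty string literal is truthy; presumably the second label is leftover from switching the job to ARM runners.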

common/arg.cpp

Lines changed: 5 additions & 36 deletions
@@ -622,10 +622,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
             string_process_escapes(seq_breaker);
         }
-        for (auto & pair : params.speculative.draft.replacements) {
-            string_process_escapes(pair.first);
-            string_process_escapes(pair.second);
-        }
     }

     if (!params.kv_overrides.empty()) {
@@ -3518,13 +3514,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.speculative.draft.p_min = std::stof(value);
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_P_MIN"));
-    add_opt(common_arg(
-        {"--spec-draft-ctx-size", "-cd", "--ctx-size-draft"}, "N",
-        string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.draft.n_ctx),
-        [](common_params & params, int value) {
-            params.speculative.draft.n_ctx = value;
-        }
-    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_CTX_SIZE"));
    add_opt(common_arg(
        {"--spec-draft-device", "-devd", "--device-draft"}, "<dev1,dev2,..>",
        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -3561,32 +3550,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_MODEL"));
    add_opt(common_arg(
-        {"--spec-draft-replace", "--spec-replace"}, "TARGET", "DRAFT",
-        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
-        [](common_params & params, const std::string & tgt, const std::string & dft) {
-            params.speculative.draft.replacements.push_back({ tgt, dft });
-        }
-    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
-    add_opt(common_arg(
-        {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]",
+        {"--spec-type"}, common_speculative_all_types_str(),
        string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
-            common_speculative_type_to_str(params.speculative.type).c_str()),
+            common_speculative_type_name_str(params.speculative.types).c_str()),
        [](common_params & params, const std::string & value) {
-            if (value == "none") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
-            } else if (value == "ngram-cache") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_CACHE;
-            } else if (value == "ngram-simple") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE;
-            } else if (value == "ngram-map-k") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
-            } else if (value == "ngram-map-k4v") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
-            } else if (value == "ngram-mod") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
-            } else {
-                throw std::invalid_argument("unknown speculative decoding type without draft model");
-            }
+            const auto enabled_types = string_split<std::string>(value, ',');
+            params.speculative.types = common_speculative_types_from_names(enabled_types);
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_TYPE"));
    add_opt(common_arg(
@@ -4075,7 +4044,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--spec-default"},
        string_format("enable default speculative decoding config"),
        [](common_params & params) {
-            params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
+            params.speculative.types = { COMMON_SPECULATIVE_TYPE_NGRAM_MOD };
            params.speculative.ngram_mod.n_match = 24;
            params.speculative.ngram_mod.n_min = 48;
            params.speculative.ngram_mod.n_max = 64;
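With this change `--spec-type` accepts a comma-separated list of speculator types instead of a single value. A minimal standalone sketch of that parsing scheme follows; the helper name and the abbreviated type list are illustrative, not the commit's actual string_split / common_speculative_types_from_names implementations:

#include <map>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

enum common_speculative_type {
    COMMON_SPECULATIVE_TYPE_NONE,
    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,
    COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
};

// map a comma-separated value such as "ngram-simple,ngram-mod" to a vector of
// types, throwing on unknown names (mirroring the removed else-branch)
static std::vector<common_speculative_type> spec_types_from_csv(const std::string & value) {
    static const std::map<std::string, common_speculative_type> names = {
        { "none",         COMMON_SPECULATIVE_TYPE_NONE         },
        { "ngram-simple", COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE },
        { "ngram-mod",    COMMON_SPECULATIVE_TYPE_NGRAM_MOD    },
    };
    std::vector<common_speculative_type> types;
    std::stringstream ss(value);
    std::string name;
    while (std::getline(ss, name, ',')) {
        const auto it = names.find(name);
        if (it == names.end()) {
            throw std::invalid_argument("unknown speculative decoding type: " + name);
        }
        types.push_back(it->second);
    }
    return types;
}

int main() {
    // e.g. --spec-type ngram-simple,ngram-mod
    return spec_types_from_csv("ngram-simple,ngram-mod").size() == 2 ? 0 : 1;
}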

common/common.cpp

Lines changed: 100 additions & 1 deletion
@@ -1422,7 +1422,7 @@ common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {

    // try to remove the last tokens
    if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
-        LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
+        LOG_WRN("%s: the context does not support partial sequence removal\n", __func__);
        res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
        goto done;
    }
@@ -1960,3 +1960,102 @@ bool common_prompt_batch_decode(

    return true;
}
+
+size_t common_prompt_checkpoint::size() const {
+    return data_tgt.size() + data_dft.size();
+}
+
+bool common_prompt_checkpoint::empty() const {
+    return data_tgt.empty();
+}
+
+void common_prompt_checkpoint::clear() {
+    n_tokens = 0;
+
+    pos_min = 0;
+    pos_max = 0;
+
+    data_tgt.clear();
+    data_dft.clear();
+}
+
+void common_prompt_checkpoint::update_pos(
+        int64_t n_tokens,
+        llama_pos pos_min,
+        llama_pos pos_max) {
+    this->n_tokens = n_tokens;
+    this->pos_min = pos_min;
+    this->pos_max = pos_max;
+}
+
+void common_prompt_checkpoint::update_tgt(
+        llama_context * ctx,
+        llama_seq_id seq_id,
+        llama_state_seq_flags flags) {
+    if (ctx == nullptr) {
+        return;
+    }
+
+    const size_t ckpt_size = llama_state_seq_get_size_ext(ctx, seq_id, flags);
+
+    data_tgt.resize(ckpt_size);
+
+    const size_t n = llama_state_seq_get_data_ext(ctx, data_tgt.data(), ckpt_size, seq_id, flags);
+    if (n != ckpt_size) {
+        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size, n);
+    }
+}
+
+void common_prompt_checkpoint::update_dft(
+        llama_context * ctx,
+        llama_seq_id seq_id,
+        llama_state_seq_flags flags) {
+    if (ctx == nullptr) {
+        return;
+    }
+
+    const size_t ckpt_size = llama_state_seq_get_size_ext(ctx, seq_id, flags);
+
+    data_dft.resize(ckpt_size);
+
+    const size_t n = llama_state_seq_get_data_ext(ctx, data_dft.data(), ckpt_size, seq_id, flags);
+    if (n != ckpt_size) {
+        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size, n);
+    }
+}
+
+void common_prompt_checkpoint::load_tgt(
+        llama_context * ctx,
+        llama_seq_id seq_id,
+        llama_state_seq_flags flags) const {
+    if (ctx == nullptr) {
+        return;
+    }
+
+    if (data_tgt.empty()) {
+        return;
+    }
+
+    const size_t n = llama_state_seq_set_data_ext(ctx, data_tgt.data(), data_tgt.size(), seq_id, flags);
+    if (n != data_tgt.size()) {
+        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", data_tgt.size(), n);
+    }
+}
+
+void common_prompt_checkpoint::load_dft(
+        llama_context * ctx,
+        llama_seq_id seq_id,
+        llama_state_seq_flags flags) const {
+    if (ctx == nullptr) {
+        return;
+    }
+
+    if (data_dft.empty()) {
+        return;
+    }
+
+    const size_t n = llama_state_seq_set_data_ext(ctx, data_dft.data(), data_dft.size(), seq_id, flags);
+    if (n != data_dft.size()) {
+        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", data_dft.size(), n);
+    }
+}
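The new common_prompt_checkpoint snapshots the state of one sequence in a target context and, optionally, a draft context, so both can be rolled back together. A hypothetical usage sketch follows; ctx_tgt, ctx_dft and n_past are placeholders, and flags = 0 is assumed to select default full-state serialization:

#include "common.h"

// save sequence 0 of both contexts, then restore it later, e.g. after a
// rejected speculative branch; update_dft/load_dft are no-ops when ctx_dft
// is nullptr, and load_* is a no-op for an empty snapshot
static void checkpoint_roundtrip(llama_context * ctx_tgt, llama_context * ctx_dft, int64_t n_past) {
    common_prompt_checkpoint ckpt;

    ckpt.update_pos(n_past, /*pos_min=*/0, /*pos_max=*/(llama_pos) (n_past - 1));
    ckpt.update_tgt(ctx_tgt, /*seq_id=*/0, /*flags=*/0);
    ckpt.update_dft(ctx_dft, /*seq_id=*/0, /*flags=*/0);

    // ... decode speculative tokens into ctx_tgt/ctx_dft here ...

    ckpt.load_tgt(ctx_tgt, /*seq_id=*/0, /*flags=*/0);
    ckpt.load_dft(ctx_dft, /*seq_id=*/0, /*flags=*/0);
}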

common/common.h

Lines changed: 47 additions & 12 deletions
@@ -295,8 +295,6 @@ struct common_params_model {
    std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
};

-struct common_ngram_mod;
-
// draft-model-based speculative decoding parameters
struct common_params_speculative_draft {
    int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
@@ -307,11 +305,9 @@ struct common_params_speculative_draft {

    common_params_model mparams;

-    llama_model * model = nullptr; // a llama_model that can be shared by multiple speculative contexts
-
-    llama_context_params cparams; // these are the parameters for the draft llama_context
+    llama_context * ctx_tgt = nullptr;
+    llama_context * ctx_dft = nullptr;

-    int32_t n_ctx = 0; // draft context size
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
@@ -322,7 +318,6 @@ struct common_params_speculative_draft {

    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

-    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
};

@@ -331,9 +326,6 @@ struct common_params_speculative_ngram_mod {

    int32_t n_max = 64;
    int32_t n_min = 48;
-
-    // shared instance of the ngram container for all speculative decoding contexts
-    std::shared_ptr<common_ngram_mod> obj;
};

struct common_params_speculative_ngram_map {
@@ -348,8 +340,7 @@ struct common_params_speculative_ngram_cache {
};

struct common_params_speculative {
-    // TODO: become a vector in order to support "chains of speculators"
-    common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE;
+    std::vector<enum common_speculative_type> types = { COMMON_SPECULATIVE_TYPE_NONE };

    common_params_speculative_draft draft;

@@ -1026,3 +1017,47 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std

// "adamw" or "sgd" (case insensitive)
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
+
+//
+// prompt utils
+//
+
+struct common_prompt_checkpoint {
+    int64_t n_tokens;
+
+    llama_pos pos_min;
+    llama_pos pos_max;
+
+    std::vector<uint8_t> data_tgt;
+    std::vector<uint8_t> data_dft;
+
+    size_t size() const;
+
+    bool empty() const;
+    void clear();
+
+    void update_pos(
+            int64_t n_tokens,
+            llama_pos pos_min,
+            llama_pos pos_max);
+
+    void update_tgt(
+            llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    void update_dft(
+            llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    void load_tgt(
+            llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags) const;
+
+    void load_dft(
+            llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags) const;
+};
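For orientation, a short sketch of how the reworked speculative parameters fit together; the field names come from this diff, while the function and the context arguments are placeholders rather than code from the commit:

#include "common.h"

static void configure_speculative(common_params & params,
        llama_context * ctx_main, llama_context * ctx_draft) {
    // a single `type` field became a vector of types
    params.speculative.types = { COMMON_SPECULATIVE_TYPE_NGRAM_MOD };

    // the draft params now carry the two contexts directly, replacing the
    // old shared llama_model pointer, cparams and n_ctx fields
    params.speculative.draft.ctx_tgt = ctx_main;
    params.speculative.draft.ctx_dft = ctx_draft;
}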
