50 changes: 50 additions & 0 deletions .github/workflows/build-virtgpu.yml
@@ -0,0 +1,50 @@
name: CI (virtgpu)

on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
    paths: [
      '.github/workflows/build-virtgpu.yml',
      '**/CMakeLists.txt',
      '**/*.cmake',
      '**/*.h',
      '**/*.hpp',
      '**/*.c',
      '**/*.cpp'
    ]

  pull_request:
    types: [opened, synchronize, reopened]
    paths: [
      '.github/workflows/build-virtgpu.yml',
      'ggml/src/ggml-virtgpu/**'
    ]

concurrency:
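  # github.head_ref is only set for pull_request events, so PR updates share a
  # per-ref group and cancel stale runs, while pushes to master fall back to
  # the unique run_id and never cancel each other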
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  ubuntu-24-virtgpu:
    runs-on: ubuntu-24.04-arm

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential libdrm-dev pkg-config libssl-dev

      - name: Build
        id: cmake_build
        run: |
          cmake -B build \
            -DGGML_VIRTGPU=ON \
            -DGGML_VIRTGPU_BACKEND=ON
          cmake --build build --config Release -j $(nproc)
41 changes: 5 additions & 36 deletions common/arg.cpp
@@ -622,10 +622,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
             string_process_escapes(seq_breaker);
         }
-        for (auto & pair : params.speculative.draft.replacements) {
-            string_process_escapes(pair.first);
-            string_process_escapes(pair.second);
-        }
     }
 
     if (!params.kv_overrides.empty()) {
@@ -3518,13 +3514,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.draft.p_min = std::stof(value);
         }
     ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_P_MIN"));
-    add_opt(common_arg(
-        {"--spec-draft-ctx-size", "-cd", "--ctx-size-draft"}, "N",
-        string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.draft.n_ctx),
-        [](common_params & params, int value) {
-            params.speculative.draft.n_ctx = value;
-        }
-    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_CTX_SIZE"));
     add_opt(common_arg(
         {"--spec-draft-device", "-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -3561,32 +3550,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_MODEL"));
     add_opt(common_arg(
-        {"--spec-draft-replace", "--spec-replace"}, "TARGET", "DRAFT",
-        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
-        [](common_params & params, const std::string & tgt, const std::string & dft) {
-            params.speculative.draft.replacements.push_back({ tgt, dft });
-        }
-    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
-    add_opt(common_arg(
-        {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]",
+        {"--spec-type"}, common_speculative_all_types_str(),
         string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
-            common_speculative_type_to_str(params.speculative.type).c_str()),
+            common_speculative_type_name_str(params.speculative.types).c_str()),
         [](common_params & params, const std::string & value) {
-            if (value == "none") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
-            } else if (value == "ngram-cache") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_CACHE;
-            } else if (value == "ngram-simple") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE;
-            } else if (value == "ngram-map-k") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
-            } else if (value == "ngram-map-k4v") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
-            } else if (value == "ngram-mod") {
-                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
-            } else {
-                throw std::invalid_argument("unknown speculative decoding type without draft model");
-            }
+            const auto enabled_types = string_split<std::string>(value, ',');
+            params.speculative.types = common_speculative_types_from_names(enabled_types);
         }
     ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_TYPE"));
     add_opt(common_arg(
@@ -4075,7 +4044,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--spec-default"},
         string_format("enable default speculative decoding config"),
         [](common_params & params) {
-            params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
+            params.speculative.types = { COMMON_SPECULATIVE_TYPE_NGRAM_MOD };
             params.speculative.ngram_mod.n_match = 24;
             params.speculative.ngram_mod.n_min = 48;
             params.speculative.ngram_mod.n_max = 64;
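
Note: with this change `--spec-type` accepts a comma-separated list of strategies instead of a single name. A standalone sketch of the new semantics (illustrative only: the PR's real parsing lives in `string_split` and `common_speculative_types_from_names`; the names below are taken from the deleted if/else chain, and the enum/helper names here are hypothetical):

// minimal reimplementation of comma-separated --spec-type parsing (sketch)
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

enum spec_type { SPEC_NONE, SPEC_NGRAM_CACHE, SPEC_NGRAM_SIMPLE, SPEC_NGRAM_MAP_K, SPEC_NGRAM_MAP_K4V, SPEC_NGRAM_MOD };

static spec_type spec_type_from_name(const std::string & name) {
    if (name == "none")          return SPEC_NONE;
    if (name == "ngram-cache")   return SPEC_NGRAM_CACHE;
    if (name == "ngram-simple")  return SPEC_NGRAM_SIMPLE;
    if (name == "ngram-map-k")   return SPEC_NGRAM_MAP_K;
    if (name == "ngram-map-k4v") return SPEC_NGRAM_MAP_K4V;
    if (name == "ngram-mod")     return SPEC_NGRAM_MOD;
    throw std::invalid_argument("unknown speculative decoding type: " + name);
}

// "ngram-simple,ngram-mod" -> { SPEC_NGRAM_SIMPLE, SPEC_NGRAM_MOD }
static std::vector<spec_type> spec_types_from_csv(const std::string & value) {
    std::vector<spec_type> types;
    std::stringstream ss(value);
    for (std::string name; std::getline(ss, name, ','); ) {
        types.push_back(spec_type_from_name(name));
    }
    return types;
}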
101 changes: 100 additions & 1 deletion common/common.cpp
@@ -1422,7 +1422,7 @@ common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {
 
     // try to remove the last tokens
     if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
-        LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
+        LOG_WRN("%s: the context does not support partial sequence removal\n", __func__);
         res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
         goto done;
     }
@@ -1960,3 +1960,102 @@ bool common_prompt_batch_decode(

    return true;
}

size_t common_prompt_checkpoint::size() const {
    return data_tgt.size() + data_dft.size();
}

bool common_prompt_checkpoint::empty() const {
    return data_tgt.empty();
}

void common_prompt_checkpoint::clear() {
    n_tokens = 0;

    pos_min = 0;
    pos_max = 0;

    data_tgt.clear();
    data_dft.clear();
}

void common_prompt_checkpoint::update_pos(
        int64_t n_tokens,
        llama_pos pos_min,
        llama_pos pos_max) {
    this->n_tokens = n_tokens;
    this->pos_min = pos_min;
    this->pos_max = pos_max;
}

void common_prompt_checkpoint::update_tgt(
        llama_context * ctx,
        llama_seq_id seq_id,
        llama_state_seq_flags flags) {
    if (ctx == nullptr) {
        return;
    }

    const size_t ckpt_size = llama_state_seq_get_size_ext(ctx, seq_id, flags);

    data_tgt.resize(ckpt_size);

    const size_t n = llama_state_seq_get_data_ext(ctx, data_tgt.data(), ckpt_size, seq_id, flags);
    if (n != ckpt_size) {
        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size, n);
    }
}

void common_prompt_checkpoint::update_dft(
        llama_context * ctx,
        llama_seq_id seq_id,
        llama_state_seq_flags flags) {
    if (ctx == nullptr) {
        return;
    }

    const size_t ckpt_size = llama_state_seq_get_size_ext(ctx, seq_id, flags);

    data_dft.resize(ckpt_size);

    const size_t n = llama_state_seq_get_data_ext(ctx, data_dft.data(), ckpt_size, seq_id, flags);
    if (n != ckpt_size) {
        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size, n);
    }
}

void common_prompt_checkpoint::load_tgt(
        llama_context * ctx,
        llama_seq_id seq_id,
        llama_state_seq_flags flags) const {
    if (ctx == nullptr) {
        return;
    }

    if (data_tgt.empty()) {
        return;
    }

    const size_t n = llama_state_seq_set_data_ext(ctx, data_tgt.data(), data_tgt.size(), seq_id, flags);
    if (n != data_tgt.size()) {
        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", data_tgt.size(), n);
    }
}

void common_prompt_checkpoint::load_dft(
        llama_context * ctx,
        llama_seq_id seq_id,
        llama_state_seq_flags flags) const {
    if (ctx == nullptr) {
        return;
    }

    if (data_dft.empty()) {
        return;
    }

    const size_t n = llama_state_seq_set_data_ext(ctx, data_dft.data(), data_dft.size(), seq_id, flags);
    if (n != data_dft.size()) {
        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", data_dft.size(), n);
    }
}
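
Note: a minimal usage sketch for the new checkpoint helpers. The caller below is hypothetical: the contexts, sequence id 0, and the `0` flags value are placeholder assumptions, not taken from this PR.

#include "common.h"
#include "llama.h"

// snapshot the target/draft pair after n_past prompt tokens
static void save_checkpoint(common_prompt_checkpoint & ckpt,
        llama_context * ctx_tgt, llama_context * ctx_dft, int64_t n_past) {
    ckpt.update_pos(n_past, /*pos_min=*/0, /*pos_max=*/(llama_pos) n_past - 1);
    ckpt.update_tgt(ctx_tgt, /*seq_id=*/0, /*flags=*/0);
    ckpt.update_dft(ctx_dft, /*seq_id=*/0, /*flags=*/0); // no-op when ctx_dft == nullptr
}

// roll both contexts back to the snapshot (load_* are no-ops for parts never saved)
static void restore_checkpoint(const common_prompt_checkpoint & ckpt,
        llama_context * ctx_tgt, llama_context * ctx_dft) {
    ckpt.load_tgt(ctx_tgt, /*seq_id=*/0, /*flags=*/0);
    ckpt.load_dft(ctx_dft, /*seq_id=*/0, /*flags=*/0);
}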
59 changes: 47 additions & 12 deletions common/common.h
@@ -295,8 +295,6 @@ struct common_params_model {
     std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
 };
 
-struct common_ngram_mod;
-
 // draft-model-based speculative decoding parameters
 struct common_params_speculative_draft {
     int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
@@ -307,11 +305,9 @@ struct common_params_speculative_draft {
 
     common_params_model mparams;
 
-    llama_model * model = nullptr; // a llama_model that can be shared by multiple speculative contexts
-
     llama_context_params cparams; // these are the parameters for the draft llama_context
-    llama_context * ctx_tgt = nullptr;
-    llama_context * ctx_dft = nullptr;
+
+    int32_t n_ctx = 0; // draft context size
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
 
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
@@ -322,7 +318,6 @@ struct common_params_speculative_draft {
 
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
-    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 };

@@ -331,9 +326,6 @@ struct common_params_speculative_ngram_mod {
 
     int32_t n_max = 64;
     int32_t n_min = 48;
-
-    // shared instance of the ngram container for all speculative decoding contexts
-    std::shared_ptr<common_ngram_mod> obj;
 };
 
 struct common_params_speculative_ngram_map {
@@ -348,8 +340,7 @@ struct common_params_speculative_ngram_cache {
 };
 
 struct common_params_speculative {
-    // TODO: become a vector in order to support "chains of speculators"
-    common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE;
+    std::vector<enum common_speculative_type> types = { COMMON_SPECULATIVE_TYPE_NONE };
 
     common_params_speculative_draft draft;
 
@@ -1026,3 +1017,47 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std

// "adamw" or "sgd" (case insensitive)
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);

//
// prompt utils
//

struct common_prompt_checkpoint {
    int64_t n_tokens;

    llama_pos pos_min;
    llama_pos pos_max;

    std::vector<uint8_t> data_tgt;
    std::vector<uint8_t> data_dft;

    size_t size() const;

    bool empty() const;
    void clear();

    void update_pos(
            int64_t n_tokens,
            llama_pos pos_min,
            llama_pos pos_max);

    void update_tgt(
            llama_context * ctx,
            llama_seq_id seq_id,
            llama_state_seq_flags flags);

    void update_dft(
            llama_context * ctx,
            llama_seq_id seq_id,
            llama_state_seq_flags flags);

    void load_tgt(
            llama_context * ctx,
            llama_seq_id seq_id,
            llama_state_seq_flags flags) const;

    void load_dft(
            llama_context * ctx,
            llama_seq_id seq_id,
            llama_state_seq_flags flags) const;
};
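
Note: each checkpoint holds full serialized sequence state for both contexts, so callers may want to bound the total footprint. A hypothetical bookkeeping sketch (the budget policy is illustrative, not from this PR):

#include <vector>

// sum of all checkpoint payloads, e.g. to decide whether taking another
// snapshot would exceed a caller-chosen memory budget
static size_t checkpoints_total_bytes(const std::vector<common_prompt_checkpoint> & ckpts) {
    size_t total = 0;
    for (const auto & ckpt : ckpts) {
        total += ckpt.size(); // data_tgt.size() + data_dft.size()
    }
    return total;
}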