Skip to content

Commit c073320

Browse files
committed
Add temporary hack to skip the --fit memory measurement for Gemma 4 assistant draft models (to be removed later)
1 parent 1e4fb9f commit c073320

1 file changed

Lines changed: 18 additions & 1 deletion

File tree

tools/server/server-context.cpp

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "fit.h"
1212
#include "llama.h"
1313
#include "../../src/llama-ext.h" // staging API: llama_set_mtp_source
14+
#include "ggml-cpp.h"
1415
#include "log.h"
1516
#include "sampling.h"
1617
#include "speculative.h"
@@ -828,11 +829,27 @@ struct server_context_impl {
828829
}
829830
cparams_dft.n_rs_seq = 0;
830831

832+
bool skip_measure = false;
833+
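// TODO: remove this temporary hack (skips the --fit memory measurement for Gemma 4 assistant draft models)
// NOTE(review): the results of gguf_init_from_file and gguf_find_key are used unchecked below - confirm they cannot fail here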
834+
if (spec_mtp && has_draft) {
835+
struct gguf_init_params meta_params = {
836+
/* .no_alloc = */ true,
837+
/* .ctx = */ nullptr,
838+
};
839+
gguf_context_ptr meta(gguf_init_from_file(params_dft.model.path.c_str(), meta_params));
840+
841+
if (std::string(gguf_get_val_str(meta.get(), gguf_find_key(meta.get(), "general.architecture"))) == "gemma4-assistant") {
842+
skip_measure = true;
843+
SRV_WRN("[spec] skipping --fit memory measurement for Gemma 4 assistant draft model '%s'\n",
844+
params_dft.model.path.c_str());
845+
}
846+
}
847+
831848
std::vector<ggml_backend_dev_t> devs;
832849
uint32_t hp_ngl = 0;
833850
uint32_t hp_nct = 0;
834851
uint32_t hp_nex = 0;
835-
try {
852+
if (!skip_measure) try {
836853
auto dmd = common_get_device_memory_data(
837854
params_dft.model.path.c_str(), &mparams_dft, &cparams_dft,
838855
devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);

0 commit comments

Comments
 (0)