Skip to content

Commit 36dafba

Browse files
llama: fix llama-model-saver (#20503)
* llama : add fd-based model loading via llama_model_load_from_fd
* llama : address review feedback for fd-based model loading
* llama : use FILE pointer instead of fd in public API
* llama : use FILE pointer consistently, address review feedback
* fixup
* fix tensor names
* fix llama-model-saver
* roundtrip tests
* fixup
* refactor tests
* fix prints
* fix model saving
* fix CI, disable Chameleon
* print seed

---------

Co-authored-by: Siddhesh2377 <siddheshsonar2377@gmail.com>
1 parent 69e0ece commit 36dafba

16 files changed

Lines changed: 338 additions & 99 deletions

ggml/include/gguf.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ extern "C" {
7777
};
7878

7979
GGML_API struct gguf_context * gguf_init_empty(void);
80+
GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params);
8081
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
8182
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
8283

@@ -189,6 +190,7 @@ extern "C" {
189190
//
190191

191192
// write the entire context to a binary file
193+
GGML_API bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta);
192194
GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
193195

194196
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding

ggml/src/ggml-impl.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -773,6 +773,5 @@ inline bool ggml_check_edges(const struct ggml_cgraph * cgraph,
773773

774774
// expose GGUF internals for test code
775775
GGML_API size_t gguf_type_size(enum gguf_type type);
776-
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
777776
GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
778777
#endif // __cplusplus

ggml/src/gguf.cpp

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -394,7 +394,11 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
394394
return true;
395395
}
396396

397-
struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
397+
struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) {
398+
if (!file) {
399+
return nullptr;
400+
}
401+
398402
const struct gguf_reader gr(file);
399403
struct gguf_context * ctx = new gguf_context;
400404

@@ -848,7 +852,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
848852
return nullptr;
849853
}
850854

851-
struct gguf_context * result = gguf_init_from_file_impl(file, params);
855+
struct gguf_context * result = gguf_init_from_file_ptr(file, params);
852856
fclose(file);
853857
return result;
854858
}
@@ -1508,6 +1512,19 @@ void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & bu
15081512
gguf_write_out(ctx, gw, only_meta);
15091513
}
15101514

1515+
bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta) {
1516+
GGML_ASSERT(file);
1517+
1518+
try {
1519+
gguf_writer_file gw(file);
1520+
gguf_write_out(ctx, gw, only_meta);
1521+
} catch (const std::runtime_error& ex) {
1522+
GGML_LOG_ERROR("%s: failed to write GGUF data: %s\n", __func__, ex.what());
1523+
return false;
1524+
}
1525+
return true;
1526+
}
1527+
15111528
bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
15121529
FILE * file = ggml_fopen(fname, "wb");
15131530

@@ -1516,17 +1533,13 @@ bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, boo
15161533
return false;
15171534
}
15181535

1519-
try {
1520-
gguf_writer_file gw(file);
1521-
gguf_write_out(ctx, gw, only_meta);
1522-
} catch (const std::runtime_error& ex) {
1523-
GGML_LOG_ERROR("%s: failed to write GGUF data into '%s': %s\n", __func__, fname, ex.what());
1524-
fclose(file);
1525-
return false;
1536+
const bool success = gguf_write_to_file_ptr(ctx, file, only_meta);
1537+
if (!success) {
1538+
GGML_LOG_ERROR("%s: failed to write GGUF data into '%s'\n", __func__, fname);
15261539
}
15271540

15281541
fclose(file);
1529-
return true;
1542+
return success;
15301543
}
15311544

15321545
size_t gguf_get_meta_size(const struct gguf_context * ctx) {

include/llama.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -465,6 +465,11 @@ extern "C" {
465465
const char * path_model,
466466
struct llama_model_params params);
467467

468+
// Load a model from an open FILE pointer
469+
LLAMA_API struct llama_model * llama_model_load_from_file_ptr(
470+
FILE * file,
471+
struct llama_model_params params);
472+
468473
// Load a model from multiple splits (support custom naming scheme)
469474
// The paths must be in the correct order
470475
LLAMA_API struct llama_model * llama_model_load_from_splits(

src/llama-arch.cpp

Lines changed: 16 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,10 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
544544
case LLM_ARCH_CLIP:
545545
return {};
546546
case LLM_ARCH_LLAMA:
547+
case LLM_ARCH_REFACT:
548+
case LLM_ARCH_MINICPM:
549+
case LLM_ARCH_GRANITE:
550+
case LLM_ARCH_GRANITE_MOE:
547551
case LLM_ARCH_DECI:
548552
case LLM_ARCH_MISTRAL3:
549553
case LLM_ARCH_LLAMA_EMBED:
@@ -744,11 +748,9 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
744748
LLM_TENSOR_ATTN_Q_NORM,
745749
LLM_TENSOR_ATTN_K_NORM,
746750
};
747-
case LLM_ARCH_REFACT:
748751
case LLM_ARCH_QWEN2:
749752
case LLM_ARCH_QWEN2VL:
750753
case LLM_ARCH_INTERNLM2:
751-
case LLM_ARCH_GRANITE:
752754
case LLM_ARCH_ERNIE4_5:
753755
case LLM_ARCH_PADDLEOCR:
754756
case LLM_ARCH_SMOLLM3:
@@ -759,6 +761,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
759761
LLM_TENSOR_TOKEN_EMBD,
760762
LLM_TENSOR_OUTPUT_NORM,
761763
LLM_TENSOR_OUTPUT,
764+
LLM_TENSOR_ROPE_FREQS,
762765
LLM_TENSOR_ATTN_NORM,
763766
LLM_TENSOR_ATTN_Q,
764767
LLM_TENSOR_ATTN_K,
@@ -1232,29 +1235,6 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
12321235
LLM_TENSOR_FFN_DOWN,
12331236
LLM_TENSOR_FFN_UP,
12341237
};
1235-
case LLM_ARCH_MINICPM:
1236-
return {
1237-
LLM_TENSOR_TOKEN_EMBD,
1238-
LLM_TENSOR_OUTPUT_NORM,
1239-
LLM_TENSOR_OUTPUT,
1240-
LLM_TENSOR_ROPE_FREQS,
1241-
LLM_TENSOR_ROPE_FACTORS_LONG,
1242-
LLM_TENSOR_ROPE_FACTORS_SHORT,
1243-
LLM_TENSOR_ATTN_NORM,
1244-
LLM_TENSOR_ATTN_Q,
1245-
LLM_TENSOR_ATTN_K,
1246-
LLM_TENSOR_ATTN_V,
1247-
LLM_TENSOR_ATTN_OUT,
1248-
LLM_TENSOR_ATTN_ROT_EMBD,
1249-
LLM_TENSOR_FFN_GATE_INP,
1250-
LLM_TENSOR_FFN_NORM,
1251-
LLM_TENSOR_FFN_GATE,
1252-
LLM_TENSOR_FFN_DOWN,
1253-
LLM_TENSOR_FFN_UP,
1254-
LLM_TENSOR_FFN_GATE_EXP,
1255-
LLM_TENSOR_FFN_DOWN_EXP,
1256-
LLM_TENSOR_FFN_UP_EXP,
1257-
};
12581238
case LLM_ARCH_MINICPM3:
12591239
return {
12601240
LLM_TENSOR_TOKEN_EMBD,
@@ -1442,6 +1422,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
14421422
LLM_TENSOR_TOKEN_EMBD,
14431423
LLM_TENSOR_OUTPUT,
14441424
LLM_TENSOR_OUTPUT_NORM,
1425+
LLM_TENSOR_ROPE_FREQS,
14451426
LLM_TENSOR_ATTN_NORM,
14461427
LLM_TENSOR_ATTN_Q,
14471428
LLM_TENSOR_ATTN_K,
@@ -1657,7 +1638,9 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
16571638
LLM_TENSOR_ROPE_FREQS,
16581639
LLM_TENSOR_OUTPUT_NORM,
16591640
LLM_TENSOR_OUTPUT,
1641+
LLM_TENSOR_TOKEN_EMBD,
16601642
LLM_TENSOR_ATTN_NORM,
1643+
LLM_TENSOR_ATTN_QKV,
16611644
LLM_TENSOR_ATTN_Q,
16621645
LLM_TENSOR_ATTN_K,
16631646
LLM_TENSOR_ATTN_V,
@@ -2061,30 +2044,12 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
20612044
LLM_TENSOR_FFN_DOWN,
20622045
LLM_TENSOR_FFN_UP,
20632046
};
2064-
case LLM_ARCH_GRANITE_MOE:
2065-
return {
2066-
LLM_TENSOR_TOKEN_EMBD,
2067-
LLM_TENSOR_OUTPUT_NORM,
2068-
LLM_TENSOR_OUTPUT,
2069-
LLM_TENSOR_ATTN_NORM,
2070-
LLM_TENSOR_ATTN_Q,
2071-
LLM_TENSOR_ATTN_K,
2072-
LLM_TENSOR_ATTN_V,
2073-
LLM_TENSOR_ATTN_OUT,
2074-
LLM_TENSOR_FFN_NORM,
2075-
LLM_TENSOR_FFN_GATE_INP,
2076-
LLM_TENSOR_FFN_GATE_EXPS,
2077-
LLM_TENSOR_FFN_DOWN_EXPS,
2078-
LLM_TENSOR_FFN_UP_EXPS,
2079-
LLM_TENSOR_FFN_GATE_SHEXP,
2080-
LLM_TENSOR_FFN_DOWN_SHEXP,
2081-
LLM_TENSOR_FFN_UP_SHEXP,
2082-
};
20832047
case LLM_ARCH_GRANITE_HYBRID:
20842048
return {
20852049
LLM_TENSOR_TOKEN_EMBD,
20862050
LLM_TENSOR_OUTPUT_NORM,
20872051
LLM_TENSOR_OUTPUT,
2052+
LLM_TENSOR_ROPE_FREQS,
20882053
LLM_TENSOR_ATTN_NORM,
20892054
LLM_TENSOR_SSM_IN,
20902055
LLM_TENSOR_SSM_CONV1D,
@@ -2412,6 +2377,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
24122377
LLM_TENSOR_TOKEN_EMBD,
24132378
LLM_TENSOR_OUTPUT_NORM,
24142379
LLM_TENSOR_OUTPUT,
2380+
LLM_TENSOR_ROPE_FREQS,
24152381
LLM_TENSOR_ATTN_NORM,
24162382
LLM_TENSOR_ATTN_QKV,
24172383
LLM_TENSOR_ATTN_OUT,
@@ -2789,7 +2755,12 @@ std::string LLM_TN_IMPL::str() const {
27892755
}
27902756

27912757
if (model_tensors.find(tensor) == model_tensors.end()) {
2792-
return LLM_TENSOR_NAMES.at(tensor);
2758+
const char * name = LLM_TENSOR_NAMES.at(tensor);
2759+
if (suffix != nullptr || bid != -1 || xid != -1) {
2760+
LLAMA_LOG_WARN("%s: cannot properly format tensor name %s with suffix=%s bid=%d xid=%d\n",
2761+
__func__, name, suffix, bid, xid);
2762+
}
2763+
return name;
27932764
}
27942765

27952766
std::string name = ::format(LLM_TENSOR_NAMES.at(tensor), bid, xid);

src/llama-mmap.cpp

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,14 @@ struct llama_file::impl {
8686
seek(0, SEEK_SET);
8787
}
8888

89+
impl(FILE * file) : owns_fp(false) {
90+
fp = file;
91+
fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
92+
seek(0, SEEK_END);
93+
size = tell();
94+
seek(0, SEEK_SET);
95+
}
96+
8997
size_t tell() const {
9098
LARGE_INTEGER li;
9199
li.QuadPart = 0;
@@ -159,7 +167,7 @@ struct llama_file::impl {
159167
}
160168

161169
~impl() {
162-
if (fp) {
170+
if (fp && owns_fp) {
163171
std::fclose(fp);
164172
}
165173
}
@@ -209,6 +217,13 @@ struct llama_file::impl {
209217
seek(0, SEEK_SET);
210218
}
211219

220+
impl(FILE * file) : fname("(file*)"), owns_fp(false) {
221+
fp = file;
222+
seek(0, SEEK_END);
223+
size = tell();
224+
seek(0, SEEK_SET);
225+
}
226+
212227
size_t tell() const {
213228
if (fd == -1) {
214229
long ret = std::ftell(fp);
@@ -353,7 +368,7 @@ struct llama_file::impl {
353368
~impl() {
354369
if (fd != -1) {
355370
close(fd);
356-
} else {
371+
} else if (owns_fp) {
357372
std::fclose(fp);
358373
}
359374
}
@@ -369,10 +384,14 @@ struct llama_file::impl {
369384

370385
FILE * fp{};
371386
size_t size{};
387+
bool owns_fp = true;
372388
};
373389

374390
llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
375391
pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
392+
393+
llama_file::llama_file(FILE * file) : pimpl(std::make_unique<impl>(file)) {}
394+
376395
llama_file::~llama_file() = default;
377396

378397
size_t llama_file::tell() const { return pimpl->tell(); }

src/llama-mmap.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
1515

1616
struct llama_file {
1717
llama_file(const char * fname, const char * mode, bool use_direct_io = false);
18+
llama_file(FILE * file);
1819
~llama_file();
1920

2021
size_t tell() const;

src/llama-model-loader.cpp

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -511,6 +511,7 @@ llama_model_loader::llama_model_loader(
511511
void * set_tensor_data_ud,
512512
const std::string & fname,
513513
std::vector<std::string> & splits,
514+
FILE * file,
514515
bool use_mmap,
515516
bool use_direct_io,
516517
bool check_tensors,
@@ -658,6 +659,36 @@ llama_model_loader::llama_model_loader(
658659

659660
LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
660661
}
662+
} else if (file != nullptr) {
663+
struct ggml_context * ctx = NULL;
664+
struct gguf_init_params params = {
665+
/*.no_alloc = */ true,
666+
/*.ctx = */ &ctx,
667+
};
668+
669+
metadata_ptr.reset(gguf_init_from_file_ptr(file, params));
670+
metadata = metadata_ptr.get();
671+
if (metadata == nullptr) {
672+
throw std::runtime_error(format("%s: failed to load model from file pointer", __func__));
673+
}
674+
675+
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
676+
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
677+
678+
files.emplace_back(new llama_file(file));
679+
contexts.emplace_back(ctx);
680+
681+
// Save tensors data offset info of the main file.
682+
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
683+
std::string tensor_name = std::string(cur->name);
684+
// make sure there is no duplicated tensor names
685+
if (weights_map.find(tensor_name) != weights_map.end()) {
686+
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
687+
}
688+
n_elements += ggml_nelements(cur);
689+
n_bytes += ggml_nbytes(cur);
690+
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur));
691+
}
661692
} else {
662693
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
663694
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
@@ -669,7 +700,7 @@ llama_model_loader::llama_model_loader(
669700
fver = (enum llama_fver) gguf_get_version(metadata);
670701

671702
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
672-
__func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
703+
__func__, n_kv, n_tensors, fname.empty() ? "(file*)" : fname.c_str(), llama_file_version_name(fver));
673704

674705
// determine file type based on the number of tensors for each quantization and print meta data
675706
// TODO: make optional

src/llama-model-loader.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ struct llama_model_loader {
125125
void * set_tensor_data_ud,
126126
const std::string & fname,
127127
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
128+
FILE * file,
128129
bool use_mmap,
129130
bool use_direct_io,
130131
bool check_tensors,

0 commit comments

Comments (0)