CrazyForks
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
Lines changed: 1 addition & 0 deletions
diff --git a/src/llama-arch.h b/src/llama-arch.h
Lines changed: 1 addition & 0 deletions
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
Lines changed: 19 additions & 6 deletions
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
Lines changed: 19 additions & 12 deletions
diff --git a/src/llama-memory-hybrid-iswa.cpp b/src/llama-memory-hybrid-iswa.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
Lines changed: 4 additions & 4 deletions
diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp
Lines changed: 4 additions & 4 deletions
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
Lines changed: 13 additions & 15 deletions
diff --git a/src/models/falcon-h1.cpp b/src/models/falcon-h1.cpp
Lines changed: 1 addition & 1 deletion
@@ -247,6 +247,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,           "%s.attention.indexer.key_length"           },
     { LLM_KV_ATTENTION_INDEXER_TOP_K,                "%s.attention.indexer.top_k"                },
     { LLM_KV_ATTENTION_SHARED_KV_LAYERS,             "%s.attention.shared_kv_layers"             },
+    { LLM_KV_ATTENTION_RECURRENT_LAYERS,             "%s.attention.recurrent_layers"             },
 
     { LLM_KV_ROPE_DIMENSION_COUNT,           "%s.rope.dimension_count"                 },
     { LLM_KV_ROPE_DIMENSION_COUNT_SWA,       "%s.rope.dimension_count_swa"             },
 
@@ -251,6 +251,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
     LLM_KV_ATTENTION_INDEXER_TOP_K,
     LLM_KV_ATTENTION_SHARED_KV_LAYERS,
+    LLM_KV_ATTENTION_RECURRENT_LAYERS,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_COUNT_SWA,
 
@@ -8,18 +8,31 @@
 void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
     if (dense_first) {
         for (uint32_t il = 0; il < n_layer; ++il) {
-            swa_layers[il] = n_pattern == 0 || (il % n_pattern != 0);
+            is_swa_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
         }
     } else {
         for (uint32_t il = 0; il < n_layer; ++il) {
-            swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
+            is_swa_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
         }
     }
 }
 
+// TODO: implement
+//void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) {
+//    if (dense_first) {
+//        for (uint32_t il = 0; il < n_layer; ++il) {
+//            is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
+//        }
+//    } else {
+//        for (uint32_t il = 0; il < n_layer; ++il) {
+//            is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
+//        }
+//    }
+//}
+
 bool llama_hparams::is_swa_any() const {
     for (uint32_t il = 0; il < n_layer; ++il) {
-        if (swa_layers[il]) {
+        if (is_swa_impl[il]) {
             return true;
         }
     }
@@ -193,9 +206,9 @@ uint32_t llama_hparams::n_embd_s() const {
     return ssm_d_state * ssm_d_inner;
 }
 
-bool llama_hparams::is_recurrent(uint32_t il) const {
+bool llama_hparams::is_recr(uint32_t il) const {
     if (il < n_layer) {
-        return recurrent_layer_arr[il];
+        return is_recr_impl[il];
     }
 
     GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer);
@@ -207,7 +220,7 @@ uint32_t llama_hparams::n_pos_per_embd() const {
 
 bool llama_hparams::is_swa(uint32_t il) const {
     if (il < n_layer) {
-        return swa_layers[il];
+        return is_swa_impl[il];
     }
 
     GGML_ABORT("fatal error");
 
@@ -37,6 +37,9 @@ struct llama_hparams_convnext {
 };
 
 struct llama_hparams {
+    // note: use the `_impl` suffix to avoid name conflict between members and getters
+    //       for example: n_embd_out() vs n_embd_out_impl
+
     bool vocab_only;
     bool no_alloc;
     bool rope_finetuned;
@@ -46,7 +49,7 @@ struct llama_hparams {
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
     uint32_t n_layer;
-    int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
+    int32_t  n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
     uint32_t n_rel_attn_bkts = 0;
@@ -137,11 +140,15 @@ struct llama_hparams {
     llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
     // the size of the sliding window (0 - no SWA)
     uint32_t n_swa = 0;
-    // if swa_layers[il] == 1, then layer il is SWA
-    // if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA)
+
+    // if is_swa_impl[il] == 1, then layer il is SWA
+    // if is_swa_impl[il] == 0, then layer il is dense (i.e. non-SWA)
     // by default, all layers are dense
     // note: using uint32_t type for compatibility reason
-    std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
+    std::array<uint32_t, LLAMA_MAX_LAYERS> is_swa_impl;
+
+    // for hybrid state space models: is_recr_impl[il] != 0 means layer il is recurrent (see is_recr())
+    std::array<uint32_t, LLAMA_MAX_LAYERS> is_recr_impl;
 
     // for State Space Models
     uint32_t ssm_d_conv  = 0;
@@ -153,9 +160,6 @@ struct llama_hparams {
     // for Kimi Linear KDA
     uint32_t n_embd_head_kda = 0;
 
-    // for hybrid state space models
-    std::array<bool, LLAMA_MAX_LAYERS> recurrent_layer_arr;
-
     bool ssm_dt_b_c_rms = false;
 
     float f_clamp_kqv      = 0.0f;
@@ -266,6 +270,14 @@ struct llama_hparams {
     // return true if one of the layers is SWA
     bool is_swa_any() const;
 
+    bool is_swa(uint32_t il) const;
+
+    // TODO: implement
+    //void set_recr_pattern(uint32_t n_pattern, bool dense_first = false);
+
+    // whether or not the given layer is recurrent (for hybrid models)
+    bool is_recr(uint32_t il) const;
+
     uint32_t n_head(uint32_t il = 0) const;
 
     uint32_t n_head_kv(uint32_t il = 0) const;
@@ -307,13 +319,8 @@ struct llama_hparams {
     // dimension of the recurrent state embeddings
     uint32_t n_embd_s() const;
 
-    // whether or not the given layer is recurrent (for hybrid models)
-    bool is_recurrent(uint32_t il) const;
-
     uint32_t n_pos_per_embd() const;
 
-    bool is_swa(uint32_t il) const;
-
     // note: currently only support if either all or none of the layers are MLA
     bool is_mla() const;
 
 
@@ -44,7 +44,7 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
         n_ubatch,
         n_pad,
         filter_attn == nullptr ?
-            [&](int32_t il) { return !hparams.is_recurrent(il); }
+            [&](int32_t il) { return !hparams.is_recr(il); }
             : filter_attn,
         nullptr
     )),
@@ -57,7 +57,7 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
         n_seq_max,
         n_rs_seq,
         filter_recr == nullptr ?
-            [&](int32_t il) { return hparams.is_recurrent(il); }
+            [&](int32_t il) { return hparams.is_recr(il); }
             : filter_recr
     )) {}
 
 
@@ -45,7 +45,7 @@ llama_memory_hybrid::llama_memory_hybrid(
         n_swa,
         swa_type,
         filter_attn == nullptr ?
-            [&](int32_t il) { return !hparams.is_recurrent(il); }
+            [&](int32_t il) { return !hparams.is_recr(il); }
             : filter_attn,
         nullptr
     )),
@@ -58,7 +58,7 @@ llama_memory_hybrid::llama_memory_hybrid(
         n_seq_max,
         n_rs_seq,
         filter_recr == nullptr ?
-            [&](int32_t il) { return hparams.is_recurrent(il); }
+            [&](int32_t il) { return hparams.is_recr(il); }
             : filter_recr
     )) {}
 
 
@@ -146,7 +146,7 @@ namespace GGUFMeta {
             const enum gguf_type arr_type = gguf_get_arr_type(ctx, k);
             return ArrayInfo {
                 arr_type,
-                size_t(gguf_get_arr_n(ctx, k)),
+                gguf_get_arr_n(ctx, k),
                 arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k),
             };
         }
@@ -445,7 +445,7 @@ namespace GGUFMeta {
         }
 
         if (n > N_MAX) {
-            throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
+            throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", n, (uint32_t) N_MAX, key.c_str()));
         }
 
         if (gguf_get_kv_type(metadata, kid) == GGUF_TYPE_ARRAY) {
@@ -502,9 +502,9 @@ namespace GGUFMeta {
     }
 
     // TODO: this is not very clever - figure out something better
-    template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
+    template bool llama_model_loader::get_key_or_arr<std::array<int,      4>>  (enum llm_kv kid, std::array<int,      4>   & result, uint32_t n, bool required);
     template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
-    template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
+    template bool llama_model_loader::get_key_or_arr<std::array<float,    512>>(enum llm_kv kid, std::array<float,    512> & result, uint32_t n, bool required);
 
 
 llama_model_loader::llama_model_loader(
 
@@ -14,9 +14,6 @@
 
 bool llama_model_saver_supports_arch(llm_arch arch) {
     switch (arch) {
-        case LLM_ARCH_QWEN3NEXT:
-        case LLM_ARCH_QWEN35:
-        case LLM_ARCH_QWEN35MOE:
         case LLM_ARCH_PLAMO3:
         case LLM_ARCH_GEMMA3:
         case LLM_ARCH_GEMMA3N:
@@ -107,6 +104,8 @@ void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, c
         gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT8, value.data(), n_values);
     } else if (std::is_same<typename Container::value_type, uint32_t>::value) {
         gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT32, value.data(), n_values);
+    } else if (std::is_same<typename Container::value_type, bool>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_BOOL, value.data(), n_values);
     } else if (std::is_same<typename Container::value_type, int32_t>::value) {
         gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT32, value.data(), n_values);
     } else if (std::is_same<typename Container::value_type, float>::value) {
@@ -245,7 +244,7 @@ void llama_model_saver::add_kv_from_model() {
     add_kv(LLM_KV_EMBEDDING_SCALE,                   hparams.f_embedding_scale);
     add_kv(LLM_KV_TOKEN_SHIFT_COUNT,                 hparams.token_shift_count);
     add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,         hparams.n_moe_layer_step);
-    // add_kv(LLM_KV_FULL_ATTENTION_INTERVAL,           ???);
+    // add_kv(LLM_KV_FULL_ATTENTION_INTERVAL,           ???); // saved as LLM_KV_ATTENTION_RECURRENT_LAYERS instead
 
     add_kv(LLM_KV_ATTENTION_HEAD_COUNT,              hparams.n_head_arr, true);
     add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV,           hparams.n_head_kv_arr, true);
@@ -279,6 +278,7 @@ void llama_model_saver::add_kv_from_model() {
     add_kv(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,      hparams.indexer_n_head);
     add_kv(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,      hparams.indexer_head_size);
     add_kv(LLM_KV_ATTENTION_INDEXER_TOP_K,           hparams.indexer_top_k);
+    add_kv(LLM_KV_ATTENTION_RECURRENT_LAYERS,        hparams.is_recr_impl, true);
 
     const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
 
 
@@ -373,10 +373,10 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
         //     count only the same type of previous layers to avoid this
         auto get_il_eff = [&](const size_t il){
             size_t ret = 0;
-            const bool il_is_recurrent = hparams.is_recurrent(il);
-            const bool il_is_swa       = hparams.is_swa(il);
+            const bool il_is_recr = hparams.is_recr(il);
+            const bool il_is_swa  = hparams.is_swa(il);
             for (size_t il_prev = 0; il_prev < il; il_prev++) {
-                ret += hparams.is_recurrent(il_prev) == il_is_recurrent && hparams.is_swa(il_prev) == il_is_swa;
+                ret += hparams.is_recr(il_prev) == il_is_recr && hparams.is_swa(il_prev) == il_is_swa;
             }
             return ret;
         };
@@ -553,7 +553,7 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
     };
 
     auto get_split_granularity = [&](int64_t blck_size, uint32_t il, const std::vector<std::pair<int64_t, uint32_t>> & segments) -> std::vector<int64_t> {
-        if (hparams.is_recurrent(il)) {
+        if (hparams.is_recr(il)) {
             // linear attention
             const int64_t head_dim  = hparams.ssm_d_state;
             const int64_t granularity_qkv = std::lcm(blck_size, head_dim);
@@ -1076,18 +1076,16 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
     std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
     std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
     std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
-    std::fill(
-        hparams.recurrent_layer_arr.begin(),
-        hparams.recurrent_layer_arr.end(),
-        llm_arch_is_recurrent(ml.get_arch()));
 
     std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
-    std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
+    std::fill(hparams.is_swa_impl.begin(),   hparams.is_swa_impl.end(), 0);
+    std::fill(hparams.is_recr_impl.begin(),  hparams.is_recr_impl.end(),  llm_arch_is_recurrent(ml.get_arch()) ? 1 : 0);
 
     std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f);
     std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
-    std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
-    std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
+    std::fill(hparams.xielu_beta.begin(),    hparams.xielu_beta.end(), 0.0f);
+    std::fill(hparams.xielu_eps.begin(),     hparams.xielu_eps.end(), 0.0f);
+
     std::fill(hparams.swiglu_clamp_exp.begin(),   hparams.swiglu_clamp_exp.end(),   0.0f);
     std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f);
 
@@ -2040,18 +2038,18 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         filter_recr = [&](int32_t) { return true; };
                     } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
                         filter_attn = [&](int32_t il) {
-                            return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+                            return !hparams.is_recr(il) && hparams.n_ff(il) == 0;
                         };
                         filter_recr = [&](int32_t il) {
-                            return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+                            return hparams.is_recr(il) && hparams.n_ff(il) == 0;
                         };
                     } else if (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE) {
                         const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
                         filter_attn = [&, n_main](int32_t il) {
-                            return (uint32_t)il < n_main && !hparams.is_recurrent(il);
+                            return (uint32_t)il < n_main && !hparams.is_recr(il);
                         };
                         filter_recr = [&, n_main](int32_t il) {
-                            return (uint32_t)il < n_main && hparams.is_recurrent(il);
+                            return (uint32_t)il < n_main && hparams.is_recr(il);
                         };
                     }
 
 
@@ -11,7 +11,7 @@ void llama_model_falcon_h1::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
     ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
 
-    std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
+    std::fill(hparams.is_recr_impl.begin(), hparams.is_recr_impl.end(), true);
 
     switch (hparams.n_layer) {
         case 36:
Original file line number	Diff line number	Diff line change
`@@ -8,18 +8,31 @@`
`8`	`8`	`void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {`
`9`	`9`	`if (dense_first) {`
`10`	`10`	`for (uint32_t il = 0; il < n_layer; ++il) {`
`11`		`- swa_layers[il] = n_pattern == 0 \|\| (il % n_pattern != 0);`
	`11`	`+ is_swa_impl[il] = n_pattern == 0 \|\| (il % n_pattern != 0);`
`12`	`12`	`}`
`13`	`13`	`} else {`
`14`	`14`	`for (uint32_t il = 0; il < n_layer; ++il) {`
`15`		`- swa_layers[il] = n_pattern == 0 \|\| (il % n_pattern < (n_pattern - 1));`
	`15`	`+ is_swa_impl[il] = n_pattern == 0 \|\| (il % n_pattern < (n_pattern - 1));`
`16`	`16`	`}`
`17`	`17`	`}`
`18`	`18`	`}`
`19`	`19`
	`20`	`+// TODO: implement`
	`21`	`+//void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) {`
	`22`	`+// if (dense_first) {`
	`23`	`+// for (uint32_t il = 0; il < n_layer; ++il) {`
	`24`	`+// is_recr_impl[il] = n_pattern == 0 \|\| (il % n_pattern != 0);`
	`25`	`+// }`
	`26`	`+// } else {`
	`27`	`+// for (uint32_t il = 0; il < n_layer; ++il) {`
	`28`	`+// is_recr_impl[il] = n_pattern == 0 \|\| (il % n_pattern < (n_pattern - 1));`
	`29`	`+// }`
	`30`	`+// }`
	`31`	`+//}`
	`32`	`+`
`20`	`33`	`bool llama_hparams::is_swa_any() const {`
`21`	`34`	`for (uint32_t il = 0; il < n_layer; ++il) {`
`22`		`- if (swa_layers[il]) {`
	`35`	`+ if (is_swa_impl[il]) {`
`23`	`36`	`return true;`
`24`	`37`	`}`
`25`	`38`	`}`
`@@ -193,9 +206,9 @@ uint32_t llama_hparams::n_embd_s() const {`
`193`	`206`	`return ssm_d_state * ssm_d_inner;`
`194`	`207`	`}`
`195`	`208`
`196`		`-bool llama_hparams::is_recurrent(uint32_t il) const {`
	`209`	`+bool llama_hparams::is_recr(uint32_t il) const {`
`197`	`210`	`if (il < n_layer) {`
`198`		`- return recurrent_layer_arr[il];`
	`211`	`+ return is_recr_impl[il];`
`199`	`212`	`}`
`200`	`213`
`201`	`214`	`GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer);`
`@@ -207,7 +220,7 @@ uint32_t llama_hparams::n_pos_per_embd() const {`
`207`	`220`
`208`	`221`	`bool llama_hparams::is_swa(uint32_t il) const {`
`209`	`222`	`if (il < n_layer) {`
`210`		`- return swa_layers[il];`
	`223`	`+ return is_swa_impl[il];`
`211`	`224`	`}`
`212`	`225`
`213`	`226`	`GGML_ABORT("fatal error");`
Original file line number	Diff line number	Diff line change
`@@ -146,7 +146,7 @@ namespace GGUFMeta {`
`146`	`146`	`const enum gguf_type arr_type = gguf_get_arr_type(ctx, k);`
`147`	`147`	`return ArrayInfo {`
`148`	`148`	`arr_type,`
`149`		`- size_t(gguf_get_arr_n(ctx, k)),`
	`149`	`+ gguf_get_arr_n(ctx, k),`
`150`	`150`	`arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k),`
`151`	`151`	`};`
`152`	`152`	`}`
`@@ -445,7 +445,7 @@ namespace GGUFMeta {`
`445`	`445`	`}`
`446`	`446`
`447`	`447`	`if (n > N_MAX) {`
`448`		`- throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));`
	`448`	`+ throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", n, (uint32_t) N_MAX, key.c_str()));`
`449`	`449`	`}`
`450`	`450`
`451`	`451`	`if (gguf_get_kv_type(metadata, kid) == GGUF_TYPE_ARRAY) {`
`@@ -502,9 +502,9 @@ namespace GGUFMeta {`
`502`	`502`	`}`
`503`	`503`
`504`	`504`	`// TODO: this is not very clever - figure out something better`
`505`		`- template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);`
	`505`	`+ template bool llama_model_loader::get_key_or_arr<std::array<int, 4>> (enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);`
`506`	`506`	`template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);`
`507`		`- template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);`
	`507`	`+ template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);`
`508`	`508`
`509`	`509`
`510`	`510`	`llama_model_loader::llama_model_loader(`