From e48034dfc9e5705248fd39dc437ca887dc55a528 Mon Sep 17 00:00:00 2001
From: Aldehir Rojas <hello@alde.dev>
Date: Sun, 3 May 2026 17:18:23 -0500
Subject: [PATCH 01/13] common : determine generation prompt using longest
 common prefix (#22657)

---
 common/chat.cpp | 45 +++++++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/common/chat.cpp b/common/chat.cpp
index 159d625de99f..ade142c5cb60 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -2116,22 +2116,38 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
     return std::nullopt;
 }
 
+static std::string common_chat_templates_generation_prompt(const common_chat_template & tmpl, const autoparser::generation_params & inputs) {
+    autoparser::generation_params params = inputs;
+    params.add_generation_prompt = false;
+    std::string no_gen_prompt    = common_chat_template_direct_apply_impl(tmpl, params);
+    params.add_generation_prompt = true;
+    std::string gen_prompt       = common_chat_template_direct_apply_impl(tmpl, params);
+
+    size_t prefix_len = 0;
+    size_t min_size = std::min(no_gen_prompt.size(), gen_prompt.size());
+    while (prefix_len < min_size && no_gen_prompt[prefix_len] == gen_prompt[prefix_len]) {
+        prefix_len++;
+    }
+    return gen_prompt.substr(prefix_len);
+}
+
 static common_chat_params common_chat_templates_apply_jinja(const struct common_chat_templates *        tmpls,
                                                             const struct common_chat_templates_inputs & inputs) {
     autoparser::generation_params params;
     params.tools = common_chat_tools_to_json_oaicompat(inputs.tools);
     const auto & tmpl =
         params.tools.is_array() && tmpls->template_tool_use ? *tmpls->template_tool_use : *tmpls->template_default;
-    const auto & src        = tmpl.source();
-    const auto & caps       = tmpl.original_caps();
-    params.messages         = render_message_to_json(inputs.messages, tmpl.original_caps());
-    params.tool_choice      = inputs.tool_choice;
-    params.reasoning_format = inputs.reasoning_format;
-    params.enable_thinking  = inputs.enable_thinking;
-    params.grammar          = inputs.grammar;
-    params.now              = inputs.now;
-    params.add_bos          = tmpls->add_bos;
-    params.add_eos          = tmpls->add_eos;
+    const auto & src             = tmpl.source();
+    const auto & caps            = tmpl.original_caps();
+    params.messages              = render_message_to_json(inputs.messages, tmpl.original_caps());
+    params.tool_choice           = inputs.tool_choice;
+    params.reasoning_format      = inputs.reasoning_format;
+    params.enable_thinking       = inputs.enable_thinking;
+    params.grammar               = inputs.grammar;
+    params.now                   = inputs.now;
+    params.add_generation_prompt = inputs.add_generation_prompt;
+    params.add_bos               = tmpls->add_bos;
+    params.add_eos               = tmpls->add_eos;
 
     if (src.find("<|channel|>") == std::string::npos) {
         // map developer to system for all models except for GPT-OSS
@@ -2153,14 +2169,7 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
         workaround::func_args_not_string(params.messages);
     }
 
-    params.add_generation_prompt = false;
-    std::string no_gen_prompt    = common_chat_template_direct_apply_impl(tmpl, params);
-    params.add_generation_prompt = true;
-    std::string gen_prompt       = common_chat_template_direct_apply_impl(tmpl, params);
-    auto        diff             = calculate_diff_split(no_gen_prompt, gen_prompt);
-    params.generation_prompt     = diff.right + diff.suffix;
-
-    params.add_generation_prompt = inputs.add_generation_prompt;
+    params.generation_prompt = common_chat_templates_generation_prompt(tmpl, params);
 
     params.extra_context = common_chat_extra_context();
     for (auto el : inputs.chat_template_kwargs) {

From d4b0c22f9e67f0295e91dc1ab4f17c0fb2557fa4 Mon Sep 17 00:00:00 2001
From: Chen Yuan <constantchen525@gmail.com>
Date: Sun, 3 May 2026 23:52:53 -0400
Subject: [PATCH 02/13] ggml-webgpu: add layer norm ops (#22406)

* shader(norm): add layer norm ops

* shader(norm): stablize floating point computation with Kahan summation and handle mixed types

* shader(norm): remove the non-contiguous strides

* shader(norm): use the original implementation rather than the kahan summation
---
 .../ggml-webgpu/ggml-webgpu-shader-lib.hpp    | 32 +++++-
 ggml/src/ggml-webgpu/ggml-webgpu.cpp          |  2 +
 .../ggml-webgpu/wgsl-shaders/row_norm.wgsl    | 97 +++++++++++++++----
 3 files changed, 107 insertions(+), 24 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
index cff93b8d1705..c6dc2c21147c 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
@@ -228,11 +228,13 @@ struct ggml_webgpu_get_rows_pipeline_key_hash {
 /** Row Norm **/
 
 struct ggml_webgpu_row_norm_pipeline_key {
-    ggml_op op;
-    bool    inplace;
+    ggml_op   op;
+    ggml_type src_type;
+    ggml_type dst_type;
+    bool      inplace;
 
     bool operator==(const ggml_webgpu_row_norm_pipeline_key & other) const {
-        return op == other.op && inplace == other.inplace;
+        return op == other.op && src_type == other.src_type && dst_type == other.dst_type && inplace == other.inplace;
     }
 };
 
@@ -240,6 +242,8 @@ struct ggml_webgpu_row_norm_pipeline_key_hash {
     size_t operator()(const ggml_webgpu_row_norm_pipeline_key & key) const {
         size_t seed = 0;
         ggml_webgpu_hash_combine(seed, key.op);
+        ggml_webgpu_hash_combine(seed, key.src_type);
+        ggml_webgpu_hash_combine(seed, key.dst_type);
         ggml_webgpu_hash_combine(seed, key.inplace);
         return seed;
     }
@@ -1097,6 +1101,8 @@ class ggml_webgpu_shader_lib {
     webgpu_pipeline get_row_norm_pipeline(const ggml_webgpu_shader_lib_context & context) {
         ggml_webgpu_row_norm_pipeline_key key = {};
         key.op                                = context.dst->op;
+        key.src_type                          = context.src0->type;
+        key.dst_type                          = context.dst->type;
         key.inplace                           = ggml_webgpu_tensor_equal(context.src0, context.dst);
 
         auto it = row_norm_pipelines.find(key);
@@ -1111,6 +1117,10 @@ class ggml_webgpu_shader_lib {
                 defines.push_back("RMS_NORM");
                 variant = "rms_norm";
                 break;
+            case GGML_OP_NORM:
+                defines.push_back("NORM");
+                variant = "norm";
+                break;
             case GGML_OP_L2_NORM:
                 defines.push_back("L2_NORM");
                 variant = "l2_norm";
@@ -1124,6 +1134,22 @@ class ggml_webgpu_shader_lib {
             variant += "_inplace";
         }
 
+        if (key.src_type == GGML_TYPE_F32) {
+            defines.push_back("SRC_F32");
+            variant += "_src_f32";
+        } else if (key.src_type == GGML_TYPE_F16) {
+            defines.push_back("SRC_F16");
+            variant += "_src_f16";
+        }
+
+        if (key.dst_type == GGML_TYPE_F32) {
+            defines.push_back("DST_F32");
+            variant += "_dst_f32";
+        } else if (key.dst_type == GGML_TYPE_F16) {
+            defines.push_back("DST_F16");
+            variant += "_dst_f16";
+        }
+
         const uint32_t row_norm_wg_size = 128u;
         uint32_t       wg_size          = std::min(context.max_wg_size, row_norm_wg_size);
         defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index cab0aead1987..12f60a9900ea 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -2927,6 +2927,7 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_encode(webgpu_context ctx,
             } else {
                 return ggml_webgpu_row_norm(ctx, src0, node);
             }
+        case GGML_OP_NORM:
         case GGML_OP_L2_NORM:
             return ggml_webgpu_row_norm(ctx, src0, node);
         case GGML_OP_ROPE:
@@ -4071,6 +4072,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
                 break;
             }
         case GGML_OP_RMS_NORM:
+        case GGML_OP_NORM:
         case GGML_OP_L2_NORM:
             supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32;
             break;
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl
index bd8d32bded75..5eaf5e7bbe5d 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/row_norm.wgsl
@@ -1,20 +1,17 @@
-#ifdef INPLACE
-fn update(src_offset: u32, dst_offset: u32, scale: f32) {
-    src[dst_offset] = scale * src[src_offset];
-}
+#if defined(SRC_F16) || defined(DST_F16)
+enable f16;
+#endif
 
-@group(0) @binding(1)
-var<uniform> params: Params;
+#ifdef SRC_F16
+#define SRC_TYPE f16
 #else
-fn update(src_offset: u32, dst_offset: u32, scale: f32) {
-    dst[dst_offset] = scale * src[src_offset];
-}
-
-@group(0) @binding(1)
-var<storage, read_write> dst: array<f32>;
+#define SRC_TYPE f32
+#endif
 
-@group(0) @binding(2)
-var<uniform> params: Params;
+#ifdef DST_F16
+#define DST_TYPE f16
+#else
+#define DST_TYPE f32
 #endif
 
 struct Params {
@@ -40,9 +37,20 @@ struct Params {
 };
 
 @group(0) @binding(0)
-var<storage, read_write> src: array<f32>;
+var<storage, read_write> src: array<SRC_TYPE>;
 
-var<workgroup> scratch: array<f32, WG_SIZE>;
+#ifdef INPLACE
+@group(0) @binding(1)
+var<uniform> params: Params;
+#else
+@group(0) @binding(1)
+var<storage, read_write> dst: array<DST_TYPE>;
+
+@group(0) @binding(2)
+var<uniform> params: Params;
+#endif
+
+var<workgroup> scratch: array<f32, WG_SIZE * 2u>;
 
 @compute @workgroup_size(WG_SIZE)
 fn main(@builtin(workgroup_id) wid: vec3<u32>,
@@ -65,34 +73,81 @@ fn main(@builtin(workgroup_id) wid: vec3<u32>,
         if (col >= params.ne0) {
             break;
         }
-        sum += pow(src[i_src_row + col], 2.0);
+        let v = f32(src[i_src_row + col]);
+#ifdef NORM
+        sum += v;
+#else
+        sum += v * v;
+#endif
         col += WG_SIZE;
     }
 
     scratch[lid.x] = sum;
     workgroupBarrier();
-    var offset: u32 = WG_SIZE / 2;
+
+    var offset: u32 = WG_SIZE / 2u;
     while (offset > 0) {
         if (lid.x < offset) {
             scratch[lid.x] += scratch[lid.x + offset];
         }
-        offset = offset / 2;
+        offset /= 2u;
         workgroupBarrier();
     }
     sum = scratch[0];
 
-#ifdef RMS_NORM
+#ifdef NORM
+    let mean = sum / f32(params.ne0);
+    var sq_sum = 0.0f;
+    col = lid.x;
+    for (var j: u32 = 0; j < elems; j++) {
+        if (col >= params.ne0) {
+            break;
+        }
+        let v = f32(src[i_src_row + col]);
+        let d = v - mean;
+        sq_sum += d * d;
+        col += WG_SIZE;
+    }
+
+    workgroupBarrier();
+    scratch[lid.x] = sq_sum;
+    workgroupBarrier();
+    offset = WG_SIZE / 2u;
+    while (offset > 0) {
+        if (lid.x < offset) {
+            scratch[lid.x] += scratch[lid.x + offset];
+        }
+        offset /= 2u;
+        workgroupBarrier();
+    }
+
+    let variance = scratch[0] / f32(params.ne0);
+    let scale = 1.0 / sqrt(variance + params.eps);
+#elif defined(RMS_NORM)
     let scale = 1.0/sqrt(sum/f32(params.ne0) + params.eps);
 #elif defined(L2_NORM)
     let scale = 1.0/max(sqrt(sum), params.eps);
 #endif
 
+#ifdef NORM
+    let mean_val = mean;
+#else
+    let mean_val = 0.0f;
+#endif
+
     col = lid.x;
     for (var j: u32 = 0; j < elems; j++) {
         if (col >= params.ne0) {
             break;
         }
-        update(i_src_row + col, i_dst_row + col, scale);
+        let i_src = i_src_row + col;
+        let i_dst = i_dst_row + col;
+        let v = src[i_src];
+#ifdef INPLACE
+        src[i_dst] = scale * (v - mean_val);
+#else
+        dst[i_dst] = scale * (v - mean_val);
+#endif
         col += WG_SIZE;
     }
 }

From 6dcd824fce51e4b03ba2064f83990ff5fee85aeb Mon Sep 17 00:00:00 2001
From: Atomic-Germ <97569476+Atomic-Germ@users.noreply.github.com>
Date: Sun, 3 May 2026 22:49:29 -0700
Subject: [PATCH 03/13] vulkan: delete dead GGML_VK_MAX_NODES def (#22621)

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index c2f1883328f0..423e01dbff17 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -111,8 +111,6 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
 
 #define VK_DEVICE_DESCRIPTOR_POOL_SIZE 256
 
-#define GGML_VK_MAX_NODES 8192
-
 #define VK_CHECK(err, msg)                                          \
     do {                                                            \
         vk::Result err_ = (err);                                    \

From 846262d7875dcabf502a150fa3d7b9c770dde7eb Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 4 May 2026 08:52:07 +0300
Subject: [PATCH 04/13] docs : update speculative decoding parameters after
 refactor (#22397) (#22539)

* docs : update speculative decoding parameters after refactor (#22397)

Update docs/speculative.md to reflect the new parameter naming scheme
introduced in PR #22397:

- Replace --draft-max/--draft-min with --spec-draft-n-max/--spec-draft-n-min
- Replace --spec-ngram-size-n/m with per-implementation variants
- Add documentation for all new --spec-ngram-*- parameters
- Update all example commands

Assisted-by: llama.cpp:local pi

* pi : add rule to use gh CLI for GitHub resources

Assisted-by: llama.cpp:local pi

* docs : run llama-gen-docs

* arg : fix typo
---
 .pi/gg/SYSTEM.md           |   1 +
 common/arg.cpp             |   2 +-
 docs/speculative.md        | 129 +++++++++++++++++++++++++++++++------
 tools/cli/README.md        |  64 ++++++++++++------
 tools/completion/README.md |  13 ++--
 tools/server/README.md     |  69 +++++++++++++-------
 6 files changed, 209 insertions(+), 69 deletions(-)

diff --git a/.pi/gg/SYSTEM.md b/.pi/gg/SYSTEM.md
index 5de6fe4eb981..727a850b1833 100644
--- a/.pi/gg/SYSTEM.md
+++ b/.pi/gg/SYSTEM.md
@@ -4,6 +4,7 @@ General:
 - By very precise and concise when writing code, comments, explanations, etc.
 - PR and commit titles format: `<module> : <title>`. Lookup recents for examples
 - Don't try to build or run the code unless you are explicitly asked to do so
+- Use the `gh` CLI tool when querying PRs, issues, or other GitHub resources
 
 Coding:
 - When in doubt, always refer to the CONTRIBUTING.md file of the project
diff --git a/common/arg.cpp b/common/arg.cpp
index c21598e7687f..dd8ce40a6159 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3380,7 +3380,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--spec-draft-poll", "--poll-draft"}, "<0|1>",
-        "Use polling to wait for draft model work (default: same as --poll])",
+        "Use polling to wait for draft model work (default: same as --poll)",
         [](common_params & params, int value) {
             params.speculative.draft.cpuparams.poll = value;
         }
diff --git a/docs/speculative.md b/docs/speculative.md
index 29da332875f0..fb6ef03067d3 100644
--- a/docs/speculative.md
+++ b/docs/speculative.md
@@ -33,18 +33,18 @@ An example to use this approach can be the rewriting of source code by a LLM.
 This implementation looks for the last n-gram in history that matches the current n-gram and creates a draft using the m tokens following the matched n-gram. It is the simplest self-speculative approach with minimal overhead.
 
 ```
-llama-server [...] --spec-type ngram-simple --draft-max 64
+llama-server [...] --spec-type ngram-simple --spec-draft-n-max 64
 ```
 
 #### n-gram Map Key (`ngram-map-k`)
 
-This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`, default is 1) before generating drafts.
+This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-map-k-min-hits`, default is 1) before generating drafts.
 
 The number of accepted tokens is stored for each used n-gram.
 
 **Example:**
 ```
-llama-server [...] --spec-type ngram-map-k --draft-max 64
+llama-server [...] --spec-type ngram-map-k --spec-draft-n-max 64
 ```
 
 #### n-gram Map Key-4-Values (`ngram-map-k4v`)
@@ -55,7 +55,7 @@ The number of accepted tokens is stored for each used n-gram.
 
 **Example:** Server options to be used if there are a lot of longer repetitions.
 ```
-llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2 --draft-max 64
+llama-server [...] --spec-type ngram-map-k4v --spec-ngram-map-k4v-size-n 8 --spec-ngram-map-k4v-size-m 8 --spec-ngram-map-k4v-min-hits 2 --spec-draft-n-max 64
 ```
 
 ### n-gram Mod (`ngram-mod`)
@@ -80,9 +80,9 @@ Currently, a single hash pool is shared across all server slots, so different re
 # notes:
 # - small `n` are not recommended
 # - MoEs require long drafts
-# - dense models: can reduce `--draft-min` and `--draft-max`
+# - dense models: can reduce `--spec-ngram-mod-n-min` and `--spec-ngram-mod-n-max`
 
-llama-server ... --spec-type ngram-mod --spec-ngram-size-n 24 --draft-min 48 --draft-max 64
+llama-server ... --spec-type ngram-mod --spec-ngram-mod-n-match 24 --spec-ngram-mod-n-min 48 --spec-ngram-mod-n-max 64
 ```
 
 Applications:
@@ -105,21 +105,90 @@ Example Video:
 
 If a draft model is combined with a draftless decoding the draftless decoding has higher precedence.
 
+### General Speculative Parameters
+
 ```
---draft, --draft-n, --draft-max N       number of tokens to draft for speculative decoding (default: 16)
-                                        (env: LLAMA_ARG_DRAFT_MAX)
---draft-min, --draft-n-min N            minimum number of draft tokens to use for speculative decoding
-                                        (default: 0)
-                                        (env: LLAMA_ARG_DRAFT_MIN)
-[...]
 --spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
                                         type of speculative decoding to use when no draft model is provided
                                         (default: none)
---spec-ngram-size-n N                   ngram size N for ngram-simple/ngram-map speculative decoding, length
-                                        of lookup n-gram (default: 12)
---spec-ngram-size-m N                   ngram size M for ngram-simple/ngram-map speculative decoding, length
-                                        of draft m-gram (default: 48)
---spec-ngram-min-hits N                 minimum hits for ngram-map speculative decoding (default: 1)
+                                        (env: LLAMA_ARG_SPEC_TYPE)
+--spec-default                          use default speculative decoding
+```
+
+### Draft Model Parameters
+
+```
+--spec-draft-model, -md, --model-draft  FNAME
+                                        draft model for speculative decoding (default: unused)
+                                        (env: LLAMA_ARG_SPEC_DRAFT_MODEL)
+--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft  <user>/<model>[:quant]
+                                        HuggingFace repository for the draft model
+--spec-draft-n-max                      N
+                                        number of tokens to draft for speculative decoding (default: 16)
+                                        (env: LLAMA_ARG_SPEC_DRAFT_N_MAX)
+--spec-draft-n-min                      N
+                                        minimum number of draft tokens to use for speculative decoding (default: 0)
+                                        (env: LLAMA_ARG_SPEC_DRAFT_N_MIN)
+--spec-draft-p-split, --draft-p-split   P
+                                        speculative decoding split probability (default: 0.10)
+                                        (env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT)
+--spec-draft-p-min, --draft-p-min       P
+                                        minimum speculative decoding probability (greedy) (default: 0.75)
+                                        (env: LLAMA_ARG_SPEC_DRAFT_P_MIN)
+--spec-draft-ctx-size, -cd, --ctx-size-draft  N
+                                        size of the prompt context for the draft model (default: 0, 0 = loaded from model)
+                                        (env: LLAMA_ARG_SPEC_DRAFT_CTX_SIZE)
+--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft  N
+                                        max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
+                                        (env: LLAMA_ARG_N_GPU_LAYERS_DRAFT)
+--spec-draft-device, -devd, --device-draft  <dev1,dev2,..>
+                                        comma-separated list of devices to use for offloading the draft model
+--spec-draft-replace, --spec-replace    TARGET  DRAFT
+                                        translate the string in TARGET into DRAFT if the draft model and main model are not compatible
+```
+
+### n-gram Mod Parameters
+
+```
+--spec-ngram-mod-n-match                N
+                                        ngram-mod lookup length (default: 24)
+--spec-ngram-mod-n-min                  N
+                                        minimum number of ngram tokens to use for ngram-based speculative decoding (default: 48)
+--spec-ngram-mod-n-max                  N
+                                        maximum number of ngram tokens to use for ngram-based speculative decoding (default: 64)
+```
+
+### n-gram Simple Parameters
+
+```
+--spec-ngram-simple-size-n              N
+                                        ngram size N for ngram-simple speculative decoding, length of lookup n-gram (default: 12)
+--spec-ngram-simple-size-m              N
+                                        ngram size M for ngram-simple speculative decoding, length of draft m-gram (default: 48)
+--spec-ngram-simple-min-hits            N
+                                        minimum hits for ngram-simple speculative decoding (default: 1)
+```
+
+### n-gram Map Key Parameters
+
+```
+--spec-ngram-map-k-size-n               N
+                                        ngram size N for ngram-map-k speculative decoding, length of lookup n-gram (default: 12)
+--spec-ngram-map-k-size-m               N
+                                        ngram size M for ngram-map-k speculative decoding, length of draft m-gram (default: 48)
+--spec-ngram-map-k-min-hits             N
+                                        minimum hits for ngram-map-k speculative decoding (default: 1)
+```
+
+### n-gram Map Key-4-Values Parameters
+
+```
+--spec-ngram-map-k4v-size-n             N
+                                        ngram size N for ngram-map-k4v speculative decoding, length of lookup n-gram (default: 12)
+--spec-ngram-map-k4v-size-m             N
+                                        ngram size M for ngram-map-k4v speculative decoding, length of draft m-gram (default: 48)
+--spec-ngram-map-k4v-min-hits           N
+                                        minimum hits for ngram-map-k4v speculative decoding (default: 1)
 ```
 
 ### `--spec-type TYPE`
@@ -140,21 +209,40 @@ Specifies a type of speculative decoding without draft model.
 ./llama-server [...] --spec-type ngram-simple
 ```
 
-### `--spec-ngram-size-n N`
+### `--spec-ngram-*-size-n N`
 
 Sets the size N of the lookup n-gram for n-gram map based speculative decoding.
 The n-gram size N determines how many tokens in a row to look back when searching for matching patterns.
 
-### `--spec-ngram-size-m M`
+Each n-gram implementation has its own parameter:
+
+- `--spec-ngram-simple-size-n` for `ngram-simple`
+- `--spec-ngram-map-k-size-n` for `ngram-map-k`
+- `--spec-ngram-map-k4v-size-n` for `ngram-map-k4v`
+- `--spec-ngram-mod-n-match` for `ngram-mod`
+
+### `--spec-ngram-*-size-m M`
 
 Sets the size M of the draft m-gram for n-gram map based speculative decoding.
 The m-gram size determines how many tokens to draft when a match is found.
 Larger values can provide more speedup but may reduce acceptance rate.
 
-### `--spec-ngram-min-hits H`
+Each n-gram implementation has its own parameter:
+
+- `--spec-ngram-simple-size-m` for `ngram-simple`
+- `--spec-ngram-map-k-size-m` for `ngram-map-k`
+- `--spec-ngram-map-k4v-size-m` for `ngram-map-k4v`
+
+### `--spec-ngram-*-min-hits H`
 
 This option defines how often a key has to appear in the token history to be used as a draft (default is 1).
 
+Each n-gram implementation has its own parameter:
+
+- `--spec-ngram-simple-min-hits` for `ngram-simple`
+- `--spec-ngram-map-k-min-hits` for `ngram-map-k`
+- `--spec-ngram-map-k4v-min-hits` for `ngram-map-k4v`
+
 ## Statistics
 Each speculative decoding implementation prints statistics.
 
@@ -180,4 +268,3 @@ statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts
 - `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
 - `#acc tokens`: number of tokens accepted by the main model
 - `dur(b,g,a): durations of begin (new prompt), generation and accumulation (process acceptance).
-
diff --git a/tools/cli/README.md b/tools/cli/README.md
index de0b7804091a..bca4da7efbb7 100644
--- a/tools/cli/README.md
+++ b/tools/cli/README.md
@@ -15,7 +15,6 @@
 | `--license` | show source code license and dependencies |
 | `-cl, --cache-list` | show list of models in cache |
 | `--completion-bash` | print source-able bash completion script for llama.cpp |
-| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
 | `-t, --threads N` | number of CPU threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
 | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
 | `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
@@ -66,7 +65,7 @@
 | `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
 | `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
 | `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
-| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
+| `-sm, --split-mode {none,layer,row,tensor}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs (pipelined)<br/>- row: split weight across GPUs by rows (parallelized)<br/>- tensor: split weights and KV across GPUs (parallelized, EXPERIMENTAL)<br/>(env: LLAMA_ARG_SPLIT_MODE) |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
 | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
 | `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')<br/>(env: LLAMA_ARG_FIT) |
@@ -84,7 +83,6 @@
 | `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
 | `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
 | `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
-| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
 | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
 | `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) |
 | `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_FILE_V) |
@@ -97,8 +95,8 @@
 | `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: debug<br/>(default: 3)<br/><br/>(env: LLAMA_LOG_VERBOSITY) |
 | `--log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_LOG_PREFIX) |
 | `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
-| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) |
-| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |
+| `--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) |
+| `--spec-draft-type-v, -ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_V) |
 
 
 ### Sampling params
@@ -145,6 +143,7 @@
 
 | Argument | Explanation |
 | -------- | ----------- |
+| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
 | `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) |
 | `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal |
 | `-ctxcp, --ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
@@ -167,30 +166,59 @@
 | `--image, --audio FILE` | path to an image or audio file. use with multimodal models, use comma-separated values for multiple files |
 | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
 | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
-| `-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
-| `-cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
-| `-ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
 | `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
 | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
 | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
 | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
-| `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
-| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
-| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
-| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
-| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
-| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
-| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
-| `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible |
+| `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) |
+| `--spec-draft-threads, -td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
+| `--spec-draft-threads-batch, -tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
+| `--spec-draft-cpu-mask, -Cd, --cpu-mask-draft M` | Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask) |
+| `--spec-draft-cpu-range, -Crd, --cpu-range-draft lo-hi` | Ranges of CPUs for affinity. Complements --cpu-mask-draft |
+| `--spec-draft-cpu-strict, --cpu-strict-draft <0\|1>` | Use strict CPU placement for draft model (default: same as --cpu-strict) |
+| `--spec-draft-prio, --prio-draft N` | set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0) |
+| `--spec-draft-poll, --poll-draft <0\|1>` | Use polling to wait for draft model work (default: same as --poll) |
+| `--spec-draft-cpu-mask-batch, -Cbd, --cpu-mask-batch-draft M` | Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask) |
+| `--spec-draft-cpu-strict-batch, --cpu-strict-batch-draft <0\|1>` | Use strict CPU placement for draft model (default: --cpu-strict-draft) |
+| `--spec-draft-prio-batch, --prio-batch-draft N` | set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0) |
+| `--spec-draft-poll-batch, --poll-batch-draft <0\|1>` | Use polling to wait for draft model work (default: --poll-draft) |
+| `--spec-draft-override-tensor, -otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
+| `--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) |
+| `--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) |
+| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) |
+| `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) |
+| `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) |
+| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) |
+| `--spec-draft-ctx-size, -cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_SPEC_DRAFT_CTX_SIZE) |
+| `--spec-draft-device, -devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
+| `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
+| `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_MODEL) |
+| `--spec-draft-replace, --spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible |
+| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none)<br/><br/>(env: LLAMA_ARG_SPEC_TYPE) |
+| `--spec-ngram-mod-n-min N` | minimum number of ngram tokens to use for ngram-based speculative decoding (default: 48) |
+| `--spec-ngram-mod-n-max N` | maximum number of ngram tokens to use for ngram-based speculative decoding (default: 64) |
+| `--spec-ngram-mod-n-match N` | ngram-mod lookup length (default: 24) |
+| `--spec-ngram-simple-size-n N` | ngram size N for ngram-simple speculative decoding, length of lookup n-gram (default: 12) |
+| `--spec-ngram-simple-size-m N` | ngram size M for ngram-simple speculative decoding, length of draft m-gram (default: 48) |
+| `--spec-ngram-simple-min-hits N` | minimum hits for ngram-simple speculative decoding (default: 1) |
+| `--spec-ngram-map-k-size-n N` | ngram size N for ngram-map-k speculative decoding, length of lookup n-gram (default: 12) |
+| `--spec-ngram-map-k-size-m N` | ngram size M for ngram-map-k speculative decoding, length of draft m-gram (default: 48) |
+| `--spec-ngram-map-k-min-hits N` | minimum hits for ngram-map-k speculative decoding (default: 1) |
+| `--spec-ngram-map-k4v-size-n N` | ngram size N for ngram-map-k4v speculative decoding, length of lookup n-gram (default: 12) |
+| `--spec-ngram-map-k4v-size-m N` | ngram size M for ngram-map-k4v speculative decoding, length of draft m-gram (default: 48) |
+| `--spec-ngram-map-k4v-min-hits N` | minimum hits for ngram-map-k4v speculative decoding (default: 1) |
+| `--draft, --draft-n, --draft-max N` | the argument has been removed. use --spec-draft-n-max or --spec-ngram-mod-n-max<br/>(env: LLAMA_ARG_DRAFT_MAX) |
+| `--draft-min, --draft-n-min N` | the argument has been removed. use --spec-draft-n-min or --spec-ngram-mod-n-min<br/>(env: LLAMA_ARG_DRAFT_MIN) |
 | `--gpt-oss-20b-default` | use gpt-oss-20b (note: can download weights from the internet) |
 | `--gpt-oss-120b-default` | use gpt-oss-120b (note: can download weights from the internet) |
 | `--vision-gemma-4b-default` | use Gemma 3 4B QAT (note: can download weights from the internet) |
 | `--vision-gemma-12b-default` | use Gemma 3 12B QAT (note: can download weights from the internet) |
+| `--spec-default` | enable default speculative decoding config |
 
 <!-- HELP_END -->
diff --git a/tools/completion/README.md b/tools/completion/README.md
index fe1a036a38c6..7042889db13b 100644
--- a/tools/completion/README.md
+++ b/tools/completion/README.md
@@ -98,7 +98,6 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `--license` | show source code license and dependencies |
 | `-cl, --cache-list` | show list of models in cache |
 | `--completion-bash` | print source-able bash completion script for llama.cpp |
-| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
 | `-t, --threads N` | number of CPU threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
 | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
 | `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
@@ -149,7 +148,7 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
 | `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
 | `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
-| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
+| `-sm, --split-mode {none,layer,row,tensor}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs (pipelined)<br/>- row: split weight across GPUs by rows (parallelized)<br/>- tensor: split weights and KV across GPUs (parallelized, EXPERIMENTAL)<br/>(env: LLAMA_ARG_SPLIT_MODE) |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
 | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
 | `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')<br/>(env: LLAMA_ARG_FIT) |
@@ -167,7 +166,6 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
 | `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
 | `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
-| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
 | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
 | `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) |
 | `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_FILE_V) |
@@ -180,8 +178,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: debug<br/>(default: 3)<br/><br/>(env: LLAMA_LOG_VERBOSITY) |
 | `--log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_LOG_PREFIX) |
 | `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
-| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) |
-| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |
+| `--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) |
+| `--spec-draft-type-v, -ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_V) |
 
 
 ### Sampling params
@@ -228,6 +226,7 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 
 | Argument | Explanation |
 | -------- | ----------- |
+| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
 | `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) |
 | `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal |
 | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
@@ -255,8 +254,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
 | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
 
diff --git a/tools/server/README.md b/tools/server/README.md
index db1f27039041..eda875951322 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -82,7 +82,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
 | `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
 | `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
-| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
+| `-sm, --split-mode {none,layer,row,tensor}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs (pipelined)<br/>- row: split weight across GPUs by rows (parallelized)<br/>- tensor: split weights and KV across GPUs (parallelized, EXPERIMENTAL)<br/>(env: LLAMA_ARG_SPLIT_MODE) |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
 | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
 | `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')<br/>(env: LLAMA_ARG_FIT) |
@@ -100,7 +100,6 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
 | `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
 | `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
-| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
 | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
 | `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) |
 | `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_FILE_V) |
@@ -113,8 +112,8 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: debug<br/>(default: 3)<br/><br/>(env: LLAMA_LOG_VERBOSITY) |
 | `--log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_LOG_PREFIX) |
 | `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
-| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) |
-| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |
+| `--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) |
+| `--spec-draft-type-v, -ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_V) |
 
 
 ### Sampling params
@@ -182,9 +181,6 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
 | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
 | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
-| `-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
-| `-cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
-| `-ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
 | `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)<br/>(env: LLAMA_ARG_ALIAS) |
 | `--tags STRING` | set model tags, comma-separated (informational, not used for routing)<br/>(env: LLAMA_ARG_TAGS) |
 | `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
@@ -222,27 +218,55 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
 | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-ocr, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
 | `--sleep-idle-seconds SECONDS` | number of seconds of idleness after which the server will sleep (default: -1; -1 = disabled) |
-| `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
-| `-tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
-| `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
-| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
-| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
-| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
-| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
-| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
-| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
-| `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible |
+| `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) |
+| `--spec-draft-threads, -td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
+| `--spec-draft-threads-batch, -tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
+| `--spec-draft-cpu-mask, -Cd, --cpu-mask-draft M` | Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask) |
+| `--spec-draft-cpu-range, -Crd, --cpu-range-draft lo-hi` | Ranges of CPUs for affinity. Complements --cpu-mask-draft |
+| `--spec-draft-cpu-strict, --cpu-strict-draft <0\|1>` | Use strict CPU placement for draft model (default: same as --cpu-strict) |
+| `--spec-draft-prio, --prio-draft N` | set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0) |
+| `--spec-draft-poll, --poll-draft <0\|1>` | Use polling to wait for draft model work (default: same as --poll) |
+| `--spec-draft-cpu-mask-batch, -Cbd, --cpu-mask-batch-draft M` | Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask) |
+| `--spec-draft-cpu-strict-batch, --cpu-strict-batch-draft <0\|1>` | Use strict CPU placement for draft model (default: --cpu-strict-draft) |
+| `--spec-draft-prio-batch, --prio-batch-draft N` | set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0) |
+| `--spec-draft-poll-batch, --poll-batch-draft <0\|1>` | Use polling to wait for draft model work (default: --poll-draft) |
+| `--spec-draft-override-tensor, -otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
+| `--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) |
+| `--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) |
+| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) |
+| `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) |
+| `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) |
+| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) |
+| `--spec-draft-ctx-size, -cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_SPEC_DRAFT_CTX_SIZE) |
+| `--spec-draft-device, -devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
+| `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
+| `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_MODEL) |
+| `--spec-draft-replace, --spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible |
 | `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none)<br/><br/>(env: LLAMA_ARG_SPEC_TYPE) |
-| `--spec-ngram-size-n N` | ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: 12) |
-| `--spec-ngram-size-m N` | ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: 48) |
-| `--spec-ngram-min-hits N` | minimum hits for ngram-map speculative decoding (default: 1) |
+| `--spec-ngram-mod-n-min N` | minimum number of ngram tokens to use for ngram-based speculative decoding (default: 48) |
+| `--spec-ngram-mod-n-max N` | maximum number of ngram tokens to use for ngram-based speculative decoding (default: 64) |
+| `--spec-ngram-mod-n-match N` | ngram-mod lookup length (default: 24) |
+| `--spec-ngram-simple-size-n N` | ngram size N for ngram-simple speculative decoding, length of lookup n-gram (default: 12) |
+| `--spec-ngram-simple-size-m N` | ngram size M for ngram-simple speculative decoding, length of draft m-gram (default: 48) |
+| `--spec-ngram-simple-min-hits N` | minimum hits for ngram-simple speculative decoding (default: 1) |
+| `--spec-ngram-map-k-size-n N` | ngram size N for ngram-map-k speculative decoding, length of lookup n-gram (default: 12) |
+| `--spec-ngram-map-k-size-m N` | ngram size M for ngram-map-k speculative decoding, length of draft m-gram (default: 48) |
+| `--spec-ngram-map-k-min-hits N` | minimum hits for ngram-map-k speculative decoding (default: 1) |
+| `--spec-ngram-map-k4v-size-n N` | ngram size N for ngram-map-k4v speculative decoding, length of lookup n-gram (default: 12) |
+| `--spec-ngram-map-k4v-size-m N` | ngram size M for ngram-map-k4v speculative decoding, length of draft m-gram (default: 48) |
+| `--spec-ngram-map-k4v-min-hits N` | minimum hits for ngram-map-k4v speculative decoding (default: 1) |
+| `--draft, --draft-n, --draft-max N` | the argument has been removed. use --spec-draft-n-max or --spec-ngram-mod-n-max<br/>(env: LLAMA_ARG_DRAFT_MAX) |
+| `--draft-min, --draft-n-min N` | the argument has been removed. use --spec-draft-n-min or --spec-ngram-mod-n-min<br/>(env: LLAMA_ARG_DRAFT_MIN) |
+| `--spec-ngram-size-n N` | the argument has been removed. use the respective --spec-ngram-*-size-n or --spec-ngram-mod-n-match |
+| `--spec-ngram-size-m N` | the argument has been removed. use the respective --spec-ngram-*-size-m |
+| `--spec-ngram-min-hits N` | the argument has been removed. use the respective --spec-ngram-*-min-hits |
 | `-mv, --model-vocoder FNAME` | vocoder model for audio generation (default: unused) |
 | `--tts-use-guide-tokens` | Use guide tokens to improve TTS word recall |
 | `--embd-gemma-default` | use default EmbeddingGemma model (note: can download weights from the internet) |
@@ -256,6 +280,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `--gpt-oss-120b-default` | use gpt-oss-120b (note: can download weights from the internet) |
 | `--vision-gemma-4b-default` | use Gemma 3 4B QAT (note: can download weights from the internet) |
 | `--vision-gemma-12b-default` | use Gemma 3 12B QAT (note: can download weights from the internet) |
+| `--spec-default` | enable default speculative decoding config |
 
 <!-- HELP_END -->
 

From fa8feaed34ee89f246bc46e3f487a97fdd6e9ac7 Mon Sep 17 00:00:00 2001
From: Nick Towle <ntowle@gmail.com>
Date: Mon, 4 May 2026 00:04:07 -0700
Subject: [PATCH 05/13] webui: restore missing settings (#22666)

---
 tools/server/public/bundle.js                 | 372 +++++++++---------
 .../src/lib/constants/settings-sections.ts    |  15 +
 2 files changed, 202 insertions(+), 185 deletions(-)

diff --git a/tools/server/public/bundle.js b/tools/server/public/bundle.js
index cb792c619792..03c1feda2b04 100644
--- a/tools/server/public/bundle.js
+++ b/tools/server/public/bundle.js
@@ -810,77 +810,78 @@ return getSideAndAlignFromPlacement(placement)[1]}function Floating_layer($$anch
 getWindow}),this.handleTypeaheadSearch=this.handleTypeaheadSearch.bind(this),this.resetTypeahead=this.resetTypeahead.bind(this)}handleTypeaheadSearch(key2){if(!this.#opts.enabled()||!get$3(this.#candidateValues).length)return;this.#search.current=this.#search.current+key2;const currentItem=this.#opts.getCurrentItem(),currentMatch=get$3(this.#candidateValues).find(item=>item===currentItem)??"",values=get$3(this.#candidateValues).map(item=>item??""),nextMatch=getNextMatch(values,this.#search.current,
 currentMatch),newItem=get$3(this.#candidateValues).find(item=>item===nextMatch);return newItem&&this.#opts.onMatch(newItem),newItem}resetTypeahead(){this.#search.current=""}}const FIRST_KEYS=[ARROW_DOWN,PAGE_UP,HOME],LAST_KEYS=[ARROW_UP,PAGE_DOWN,END],FIRST_LAST_KEYS=[...FIRST_KEYS,...LAST_KEYS],selectAttrs=createBitsAttrs({component:"select",parts:["trigger","content","item","viewport","scroll-up-button","scroll-down-button","group","group-label","separator","arrow","input","content-wrapper","i\
 tem-text","value"]}),SelectRootContext=new Context$1("Select.Root | Combobox.Root"),SelectContentContext=new Context$1("Select.Content | Combobox.Content");class SelectBaseRootState{opts;#touchedInput=state$1(!1);get touchedInput(){return get$3(this.#touchedInput)}set touchedInput(value){set$1(this.#touchedInput,value,!0)}#inputNode=state$1(null);get inputNode(){return get$3(this.#inputNode)}set inputNode(value){set$1(this.#inputNode,value,!0)}#contentNode=state$1(null);get contentNode(){return get$3(
-this.#contentNode)}set contentNode(value){set$1(this.#contentNode,value,!0)}contentPresence;#viewportNode=state$1(null);get viewportNode(){return get$3(this.#viewportNode)}set viewportNode(value){set$1(this.#viewportNode,value,!0)}#triggerNode=state$1(null);get triggerNode(){return get$3(this.#triggerNode)}set triggerNode(value){set$1(this.#triggerNode,value,!0)}#valueId=state$1("");get valueId(){return get$3(this.#valueId)}set valueId(value){set$1(this.#valueId,value,!0)}#highlightedNode=state$1(
-null);get highlightedNode(){return get$3(this.#highlightedNode)}set highlightedNode(value){set$1(this.#highlightedNode,value,!0)}#highlightedValue=user_derived(()=>this.highlightedNode?this.highlightedNode.getAttribute("data-value"):null);get highlightedValue(){return get$3(this.#highlightedValue)}set highlightedValue(value){set$1(this.#highlightedValue,value)}#highlightedId=user_derived(()=>{if(this.highlightedNode)return this.highlightedNode.id});get highlightedId(){return get$3(this.#highlightedId)}set highlightedId(value){
-set$1(this.#highlightedId,value)}#highlightedLabel=user_derived(()=>this.highlightedNode?this.highlightedNode.getAttribute("data-label"):null);get highlightedLabel(){return get$3(this.#highlightedLabel)}set highlightedLabel(value){set$1(this.#highlightedLabel,value)}#contentIsPositioned=state$1(!1);get contentIsPositioned(){return get$3(this.#contentIsPositioned)}set contentIsPositioned(value){set$1(this.#contentIsPositioned,value,!0)}isUsingKeyboard=!1;isCombobox=!1;domContext=new DOMContext(()=>null);constructor(opts){
-this.opts=opts,this.isCombobox=opts.isCombobox,this.contentPresence=new PresenceManager({ref:boxWith$1(()=>this.contentNode),open:this.opts.open,onComplete:()=>{this.opts.onOpenChangeComplete.current(this.opts.open.current)}}),user_pre_effect(()=>{this.opts.open.current||this.setHighlightedNode(null)})}setHighlightedNode(node2,initial=!1){this.highlightedNode=node2,node2&&(this.isUsingKeyboard||initial)&&this.scrollHighlightedNodeIntoView(node2)}scrollHighlightedNodeIntoView(node2){!this.viewportNode||
-!this.contentIsPositioned||node2.scrollIntoView({block:this.opts.scrollAlignment.current})}getCandidateNodes(){const node2=this.contentNode;return node2?Array.from(node2.querySelectorAll(`[${this.getBitsAttr("item")}]:not([data-disabled])`)):[]}setHighlightedToFirstCandidate(initial=!1){this.setHighlightedNode(null);let nodes2=this.getCandidateNodes();if(nodes2.length){if(this.viewportNode){const viewportRect=this.viewportNode.getBoundingClientRect();nodes2=nodes2.filter(node2=>{if(!this.viewportNode)
-return!1;const nodeRect=node2.getBoundingClientRect();return nodeRect.right<=viewportRect.right&&nodeRect.left>=viewportRect.left&&nodeRect.bottom<=viewportRect.bottom&&nodeRect.top>=viewportRect.top})}this.setHighlightedNode(nodes2[0],initial)}}getNodeByValue(value){return this.getCandidateNodes().find(node2=>node2.dataset.value===value)??null}setOpen(open2){this.opts.open.current=open2}toggleOpen(){this.opts.open.current=!this.opts.open.current}handleOpen(){this.setOpen(!0)}handleClose(){this.
-setHighlightedNode(null),this.setOpen(!1)}toggleMenu(){this.toggleOpen()}getBitsAttr=part=>selectAttrs.getAttr(part,this.isCombobox?"combobox":void 0)}class SelectSingleRootState extends SelectBaseRootState{opts;isMulti=!1;#hasValue=user_derived(()=>this.opts.value.current!=="");get hasValue(){return get$3(this.#hasValue)}set hasValue(value){set$1(this.#hasValue,value)}#currentLabel=user_derived(()=>this.opts.items.current.length?this.opts.items.current.find(item=>item.value===this.opts.value.current)?.
-label??"":"");get currentLabel(){return get$3(this.#currentLabel)}set currentLabel(value){set$1(this.#currentLabel,value)}#candidateLabels=user_derived(()=>this.opts.items.current.length?this.opts.items.current.filter(item=>!item.disabled).map(item=>item.label):[]);get candidateLabels(){return get$3(this.#candidateLabels)}set candidateLabels(value){set$1(this.#candidateLabels,value)}#dataTypeaheadEnabled=user_derived(()=>!(this.isMulti||this.opts.items.current.length===0));get dataTypeaheadEnabled(){
-return get$3(this.#dataTypeaheadEnabled)}set dataTypeaheadEnabled(value){set$1(this.#dataTypeaheadEnabled,value)}constructor(opts){super(opts),this.opts=opts,user_effect(()=>{!this.opts.open.current&&this.highlightedNode&&this.setHighlightedNode(null)}),watch$1(()=>this.opts.open.current,()=>{this.opts.open.current&&this.setInitialHighlightedNode()})}includesItem(itemValue){return this.opts.value.current===itemValue}toggleItem(itemValue,itemLabel=itemValue){const newValue=this.includesItem(itemValue)?
-"":itemValue;this.opts.value.current=newValue,newValue!==""&&(this.opts.inputValue.current=itemLabel)}setInitialHighlightedNode(){afterTick(()=>{if(!(this.highlightedNode&&this.domContext.getDocument().contains(this.highlightedNode))){if(this.opts.value.current!==""){const node2=this.getNodeByValue(this.opts.value.current);if(node2){this.setHighlightedNode(node2,!0);return}}this.setHighlightedToFirstCandidate(!0)}})}}class SelectMultipleRootState extends SelectBaseRootState{opts;isMulti=!0;#hasValue=user_derived(
-()=>this.opts.value.current.length>0);get hasValue(){return get$3(this.#hasValue)}set hasValue(value){set$1(this.#hasValue,value)}constructor(opts){super(opts),this.opts=opts,user_effect(()=>{!this.opts.open.current&&this.highlightedNode&&this.setHighlightedNode(null)}),watch$1(()=>this.opts.open.current,()=>{this.opts.open.current&&this.setInitialHighlightedNode()})}includesItem(itemValue){return this.opts.value.current.includes(itemValue)}toggleItem(itemValue,itemLabel=itemValue){this.includesItem(
-itemValue)?this.opts.value.current=this.opts.value.current.filter(v=>v!==itemValue):this.opts.value.current=[...this.opts.value.current,itemValue],this.opts.inputValue.current=itemLabel}setInitialHighlightedNode(){afterTick(()=>{if(this.domContext&&!(this.highlightedNode&&this.domContext.getDocument().contains(this.highlightedNode))){if(this.opts.value.current.length&&this.opts.value.current[0]!==""){const node2=this.getNodeByValue(this.opts.value.current[0]);if(node2){this.setHighlightedNode(node2,
-!0);return}}this.setHighlightedToFirstCandidate(!0)}})}}class SelectRootState{static create(props){const{type:type2,...rest}=props,rootState=type2==="single"?new SelectSingleRootState(rest):new SelectMultipleRootState(rest);return SelectRootContext.set(rootState)}}class SelectTriggerState{static create(opts){return new SelectTriggerState(opts,SelectRootContext.get())}opts;root;attachment;#domTypeahead;#dataTypeahead;constructor(opts,root2){this.opts=opts,this.root=root2,this.attachment=attachRef(
-opts.ref,v=>this.root.triggerNode=v),this.root.domContext=new DOMContext(opts.ref),this.#domTypeahead=new DOMTypeahead({getCurrentItem:()=>this.root.highlightedNode,onMatch:node2=>{this.root.setHighlightedNode(node2)},getActiveElement:()=>this.root.domContext.getActiveElement(),getWindow:()=>this.root.domContext.getWindow()}),this.#dataTypeahead=new DataTypeahead({getCurrentItem:()=>this.root.isMulti?"":this.root.currentLabel,onMatch:label=>{if(this.root.isMulti||!this.root.opts.items.current)return;
-const matchedItem=this.root.opts.items.current.find(item=>item.label===label);matchedItem&&(this.root.opts.value.current=matchedItem.value)},enabled:()=>!this.root.isMulti&&this.root.dataTypeaheadEnabled,candidateValues:()=>this.root.isMulti?[]:this.root.candidateLabels,getWindow:()=>this.root.domContext.getWindow()}),this.onkeydown=this.onkeydown.bind(this),this.onpointerdown=this.onpointerdown.bind(this),this.onpointerup=this.onpointerup.bind(this),this.onclick=this.onclick.bind(this)}#handleOpen(){
-this.root.opts.open.current=!0,this.#dataTypeahead.resetTypeahead(),this.#domTypeahead.resetTypeahead()}#handlePointerOpen(_){this.#handleOpen()}#handleKeyboardSelection(){const isCurrentSelectedValue=this.root.highlightedValue===this.root.opts.value.current;return!this.root.opts.allowDeselect.current&&isCurrentSelectedValue&&!this.root.isMulti?(this.root.handleClose(),!0):(this.root.highlightedValue!==null&&this.root.toggleItem(this.root.highlightedValue,this.root.highlightedLabel??void 0),!this.
-root.isMulti&&!isCurrentSelectedValue?(this.root.handleClose(),!0):!1)}onkeydown(e){if(this.root.isUsingKeyboard=!0,(e.key===ARROW_UP||e.key===ARROW_DOWN)&&e.preventDefault(),!this.root.opts.open.current){if(e.key===ENTER||e.key===SPACE||e.key===ARROW_DOWN||e.key===ARROW_UP)e.preventDefault(),this.root.handleOpen();else if(!this.root.isMulti&&this.root.dataTypeaheadEnabled){this.#dataTypeahead.handleTypeaheadSearch(e.key);return}if(this.root.hasValue)return;const candidateNodes2=this.root.getCandidateNodes();
-if(!candidateNodes2.length)return;if(e.key===ARROW_DOWN){const firstCandidate=candidateNodes2[0];this.root.setHighlightedNode(firstCandidate)}else if(e.key===ARROW_UP){const lastCandidate=candidateNodes2[candidateNodes2.length-1];this.root.setHighlightedNode(lastCandidate)}return}if(e.key===TAB){this.root.handleClose();return}if((e.key===ENTER||e.key===SPACE&&this.#domTypeahead.search==="")&&!e.isComposing&&(e.preventDefault(),this.#handleKeyboardSelection()))return;if(e.key===ARROW_UP&&e.altKey&&
-this.root.handleClose(),FIRST_LAST_KEYS.includes(e.key)){e.preventDefault();const candidateNodes2=this.root.getCandidateNodes(),currHighlightedNode=this.root.highlightedNode,currIndex=currHighlightedNode?candidateNodes2.indexOf(currHighlightedNode):-1,loop2=this.root.opts.loop.current;let nextItem;if(e.key===ARROW_DOWN?nextItem=next(candidateNodes2,currIndex,loop2):e.key===ARROW_UP?nextItem=prev(candidateNodes2,currIndex,loop2):e.key===PAGE_DOWN?nextItem=forward(candidateNodes2,currIndex,10,loop2):
-e.key===PAGE_UP?nextItem=backward(candidateNodes2,currIndex,10,loop2):e.key===HOME?nextItem=candidateNodes2[0]:e.key===END&&(nextItem=candidateNodes2[candidateNodes2.length-1]),!nextItem)return;this.root.setHighlightedNode(nextItem);return}const isModifierKey=e.ctrlKey||e.altKey||e.metaKey,isCharacterKey=e.key.length===1,isSpaceKey=e.key===SPACE,candidateNodes=this.root.getCandidateNodes();if(e.key!==TAB){if(!isModifierKey&&(isCharacterKey||isSpaceKey)){!this.#domTypeahead.handleTypeaheadSearch(
-e.key,candidateNodes)&&isSpaceKey&&(e.preventDefault(),this.#handleKeyboardSelection());return}this.root.highlightedNode||this.root.setHighlightedToFirstCandidate()}}onclick(e){e.currentTarget.focus()}onpointerdown(e){if(this.root.opts.disabled.current)return;if(e.pointerType==="touch")return e.preventDefault();const target2=e.target;target2?.hasPointerCapture(e.pointerId)&&target2?.releasePointerCapture(e.pointerId),e.button===0&&e.ctrlKey===!1&&(this.root.opts.open.current===!1?this.#handlePointerOpen(
-e):this.root.handleClose())}onpointerup(e){this.root.opts.disabled.current||(e.preventDefault(),e.pointerType==="touch"&&(this.root.opts.open.current===!1?this.#handlePointerOpen(e):this.root.handleClose()))}#props=user_derived(()=>({id:this.opts.id.current,disabled:this.root.opts.disabled.current?!0:void 0,"aria-haspopup":"listbox","aria-expanded":boolToStr(this.root.opts.open.current),"aria-activedescendant":this.root.highlightedId,"data-state":getDataOpenClosed(this.root.opts.open.current),"d\
-ata-disabled":boolToEmptyStrOrUndef(this.root.opts.disabled.current),"data-placeholder":this.root.hasValue?void 0:"",[this.root.getBitsAttr("trigger")]:"",onpointerdown:this.onpointerdown,onkeydown:this.onkeydown,onclick:this.onclick,onpointerup:this.onpointerup,...this.attachment}));get props(){return get$3(this.#props)}set props(value){set$1(this.#props,value)}}class SelectContentState{static create(opts){return SelectContentContext.set(new SelectContentState(opts,SelectRootContext.get()))}opts;root;attachment;#isPositioned=state$1(
-!1);get isPositioned(){return get$3(this.#isPositioned)}set isPositioned(value){set$1(this.#isPositioned,value,!0)}domContext;constructor(opts,root2){this.opts=opts,this.root=root2,this.attachment=attachRef(opts.ref,v=>this.root.contentNode=v),this.domContext=new DOMContext(this.opts.ref),this.root.domContext===null&&(this.root.domContext=this.domContext),onDestroyEffect(()=>{this.root.contentNode=null,this.root.contentIsPositioned=!1,this.isPositioned=!1}),watch$1(()=>this.root.opts.open.current,
-()=>{this.root.opts.open.current||(this.root.contentIsPositioned=!1,this.isPositioned=!1)}),watch$1([()=>this.isPositioned,()=>this.root.highlightedNode],()=>{!this.isPositioned||!this.root.highlightedNode||this.root.scrollHighlightedNodeIntoView(this.root.highlightedNode)}),this.onpointermove=this.onpointermove.bind(this)}onpointermove(_){this.root.isUsingKeyboard=!1}#styles=user_derived(()=>getFloatingContentCSSVars(this.root.isCombobox?"combobox":"select"));onInteractOutside=e=>{if(e.target===
-this.root.triggerNode||e.target===this.root.inputNode){e.preventDefault();return}this.opts.onInteractOutside.current(e),!e.defaultPrevented&&this.root.handleClose()};onEscapeKeydown=e=>{this.opts.onEscapeKeydown.current(e),!e.defaultPrevented&&this.root.handleClose()};onOpenAutoFocus=e=>{e.preventDefault()};onCloseAutoFocus=e=>{e.preventDefault()};get shouldRender(){return this.root.contentPresence.shouldRender}#snippetProps=user_derived(()=>({open:this.root.opts.open.current}));get snippetProps(){
-return get$3(this.#snippetProps)}set snippetProps(value){set$1(this.#snippetProps,value)}#props=user_derived(()=>({id:this.opts.id.current,role:"listbox","aria-multiselectable":this.root.isMulti?"true":void 0,"data-state":getDataOpenClosed(this.root.opts.open.current),...getDataTransitionAttrs(this.root.contentPresence.transitionStatus),[this.root.getBitsAttr("content")]:"",style:{display:"flex",flexDirection:"column",outline:"none",boxSizing:"border-box",pointerEvents:"auto",...get$3(this.#styles)},
-onpointermove:this.onpointermove,...this.attachment}));get props(){return get$3(this.#props)}set props(value){set$1(this.#props,value)}popperProps={onInteractOutside:this.onInteractOutside,onEscapeKeydown:this.onEscapeKeydown,onOpenAutoFocus:this.onOpenAutoFocus,onCloseAutoFocus:this.onCloseAutoFocus,trapFocus:!1,loop:!1,onPlaced:()=>{this.root.opts.open.current&&(this.root.contentIsPositioned=!0,this.isPositioned=!0)}}}class SelectItemState{static create(opts){return new SelectItemState(opts,SelectRootContext.
-get())}opts;root;attachment;#isSelected=user_derived(()=>this.root.includesItem(this.opts.value.current));get isSelected(){return get$3(this.#isSelected)}set isSelected(value){set$1(this.#isSelected,value)}#isHighlighted=user_derived(()=>this.root.highlightedValue===this.opts.value.current);get isHighlighted(){return get$3(this.#isHighlighted)}set isHighlighted(value){set$1(this.#isHighlighted,value)}prevHighlighted=new Previous(()=>this.isHighlighted);#mounted=state$1(!1);get mounted(){return get$3(
-this.#mounted)}set mounted(value){set$1(this.#mounted,value,!0)}constructor(opts,root2){this.opts=opts,this.root=root2,this.attachment=attachRef(opts.ref),watch$1([()=>this.isHighlighted,()=>this.prevHighlighted.current],()=>{this.isHighlighted?this.opts.onHighlight.current():this.prevHighlighted.current&&this.opts.onUnhighlight.current()}),watch$1(()=>this.mounted,()=>{this.mounted&&this.root.setInitialHighlightedNode()}),this.onpointerdown=this.onpointerdown.bind(this),this.onpointerup=this.onpointerup.
-bind(this),this.onpointermove=this.onpointermove.bind(this)}handleSelect(){if(this.opts.disabled.current)return;const isCurrentSelectedValue=this.opts.value.current===this.root.opts.value.current;if(!this.root.opts.allowDeselect.current&&isCurrentSelectedValue&&!this.root.isMulti){this.root.handleClose();return}this.root.toggleItem(this.opts.value.current,this.opts.label.current),!this.root.isMulti&&!isCurrentSelectedValue&&this.root.handleClose()}#snippetProps=user_derived(()=>({selected:this.isSelected,
-highlighted:this.isHighlighted}));get snippetProps(){return get$3(this.#snippetProps)}set snippetProps(value){set$1(this.#snippetProps,value)}onpointerdown(e){e.preventDefault()}onpointerup(e){if(!(e.defaultPrevented||!this.opts.ref.current)){if(e.pointerType==="touch"&&!isIOS){on(this.opts.ref.current,"click",()=>{this.handleSelect(),this.root.setHighlightedNode(this.opts.ref.current)},{once:!0});return}e.preventDefault(),this.handleSelect(),e.pointerType==="touch"&&this.root.setHighlightedNode(
-this.opts.ref.current)}}onpointermove(e){e.pointerType!=="touch"&&this.root.highlightedNode!==this.opts.ref.current&&this.root.setHighlightedNode(this.opts.ref.current)}#props=user_derived(()=>({id:this.opts.id.current,role:"option","aria-selected":this.root.includesItem(this.opts.value.current)?"true":void 0,"data-value":this.opts.value.current,"data-disabled":boolToEmptyStrOrUndef(this.opts.disabled.current),"data-highlighted":this.root.highlightedValue===this.opts.value.current&&!this.opts.disabled.
-current?"":void 0,"data-selected":this.root.includesItem(this.opts.value.current)?"":void 0,"data-label":this.opts.label.current,[this.root.getBitsAttr("item")]:"",onpointermove:this.onpointermove,onpointerdown:this.onpointerdown,onpointerup:this.onpointerup,...this.attachment}));get props(){return get$3(this.#props)}set props(value){set$1(this.#props,value)}}class SelectHiddenInputState{static create(opts){return new SelectHiddenInputState(opts,SelectRootContext.get())}opts;root;#shouldRender=user_derived(
-()=>this.root.opts.name.current!=="");get shouldRender(){return get$3(this.#shouldRender)}set shouldRender(value){set$1(this.#shouldRender,value)}constructor(opts,root2){this.opts=opts,this.root=root2,this.onfocus=this.onfocus.bind(this)}onfocus(e){e.preventDefault(),this.root.isCombobox?this.root.inputNode?.focus():this.root.triggerNode?.focus()}#props=user_derived(()=>({disabled:boolToTrueOrUndef(this.root.opts.disabled.current),required:boolToTrueOrUndef(this.root.opts.required.current),name:this.
-root.opts.name.current,value:this.opts.value.current,onfocus:this.onfocus}));get props(){return get$3(this.#props)}set props(value){set$1(this.#props,value)}}class SelectViewportState{static create(opts){return new SelectViewportState(opts,SelectContentContext.get())}opts;content;root;attachment;#prevScrollTop=state$1(0);get prevScrollTop(){return get$3(this.#prevScrollTop)}set prevScrollTop(value){set$1(this.#prevScrollTop,value,!0)}constructor(opts,content2){this.opts=opts,this.content=content2,
-this.root=content2.root,this.attachment=attachRef(opts.ref,v=>{this.root.viewportNode=v})}#props=user_derived(()=>({id:this.opts.id.current,role:"presentation",[this.root.getBitsAttr("viewport")]:"",style:{position:"relative",flex:1,overflow:"auto"},...this.attachment}));get props(){return get$3(this.#props)}set props(value){set$1(this.#props,value)}}class SelectScrollButtonImplState{opts;content;root;attachment;autoScrollTimer=null;userScrollTimer=-1;isUserScrolling=!1;onAutoScroll=noop$1;#mounted=state$1(
-!1);get mounted(){return get$3(this.#mounted)}set mounted(value){set$1(this.#mounted,value,!0)}constructor(opts,content2){this.opts=opts,this.content=content2,this.root=content2.root,this.attachment=attachRef(opts.ref),watch$1([()=>this.mounted],()=>{if(!this.mounted){this.isUserScrolling=!1;return}this.isUserScrolling}),user_effect(()=>{this.mounted||this.clearAutoScrollInterval()}),this.onpointerdown=this.onpointerdown.bind(this),this.onpointermove=this.onpointermove.bind(this),this.onpointerleave=
-this.onpointerleave.bind(this)}handleUserScroll(){this.content.domContext.clearTimeout(this.userScrollTimer),this.isUserScrolling=!0,this.userScrollTimer=this.content.domContext.setTimeout(()=>{this.isUserScrolling=!1},200)}clearAutoScrollInterval(){this.autoScrollTimer!==null&&(this.content.domContext.clearTimeout(this.autoScrollTimer),this.autoScrollTimer=null)}onpointerdown(_){if(this.autoScrollTimer!==null)return;const autoScroll=tick2=>{this.onAutoScroll(),this.autoScrollTimer=this.content.
-domContext.setTimeout(()=>autoScroll(tick2+1),this.opts.delay.current(tick2))};this.autoScrollTimer=this.content.domContext.setTimeout(()=>autoScroll(1),this.opts.delay.current(0))}onpointermove(e){this.onpointerdown(e)}onpointerleave(_){this.clearAutoScrollInterval()}#props=user_derived(()=>({id:this.opts.id.current,"aria-hidden":boolToStrTrueOrUndef(!0),style:{flexShrink:0},onpointerdown:this.onpointerdown,onpointermove:this.onpointermove,onpointerleave:this.onpointerleave,...this.attachment}));get props(){
-return get$3(this.#props)}set props(value){set$1(this.#props,value)}}class SelectScrollDownButtonState{static create(opts){return new SelectScrollDownButtonState(new SelectScrollButtonImplState(opts,SelectContentContext.get()))}scrollButtonState;content;root;#canScrollDown=state$1(!1);get canScrollDown(){return get$3(this.#canScrollDown)}set canScrollDown(value){set$1(this.#canScrollDown,value,!0)}scrollIntoViewTimer=null;constructor(scrollButtonState){this.scrollButtonState=scrollButtonState,this.
-content=scrollButtonState.content,this.root=scrollButtonState.root,this.scrollButtonState.onAutoScroll=this.handleAutoScroll,watch$1([()=>this.root.viewportNode,()=>this.content.isPositioned],()=>{if(!(!this.root.viewportNode||!this.content.isPositioned))return this.handleScroll(!0),on(this.root.viewportNode,"scroll",()=>this.handleScroll())}),watch$1([()=>this.root.opts.inputValue.current,()=>this.root.viewportNode,()=>this.content.isPositioned],()=>{!this.root.viewportNode||!this.content.isPositioned||
-this.handleScroll(!0)}),watch$1(()=>this.scrollButtonState.mounted,()=>{this.scrollButtonState.mounted&&(this.scrollIntoViewTimer&&clearTimeout(this.scrollIntoViewTimer),this.scrollIntoViewTimer=afterSleep(5,()=>{const activeItem=this.root.highlightedNode;activeItem&&this.root.scrollHighlightedNodeIntoView(activeItem)}))})}handleScroll=(manual=!1)=>{if(manual||this.scrollButtonState.handleUserScroll(),!this.root.viewportNode)return;const maxScroll=this.root.viewportNode.scrollHeight-this.root.viewportNode.
-clientHeight,paddingTop=Number.parseInt(getComputedStyle(this.root.viewportNode).paddingTop,10);this.canScrollDown=Math.ceil(this.root.viewportNode.scrollTop)<maxScroll-paddingTop};handleAutoScroll=()=>{const viewport=this.root.viewportNode,selectedItem=this.root.highlightedNode;!viewport||!selectedItem||(viewport.scrollTop=viewport.scrollTop+selectedItem.offsetHeight)};#props=user_derived(()=>({...this.scrollButtonState.props,[this.root.getBitsAttr("scroll-down-button")]:""}));get props(){return get$3(
-this.#props)}set props(value){set$1(this.#props,value)}}class SelectScrollUpButtonState{static create(opts){return new SelectScrollUpButtonState(new SelectScrollButtonImplState(opts,SelectContentContext.get()))}scrollButtonState;content;root;#canScrollUp=state$1(!1);get canScrollUp(){return get$3(this.#canScrollUp)}set canScrollUp(value){set$1(this.#canScrollUp,value,!0)}constructor(scrollButtonState){this.scrollButtonState=scrollButtonState,this.content=scrollButtonState.content,this.root=scrollButtonState.
-root,this.scrollButtonState.onAutoScroll=this.handleAutoScroll,watch$1([()=>this.root.viewportNode,()=>this.content.isPositioned],()=>{if(!(!this.root.viewportNode||!this.content.isPositioned))return this.handleScroll(!0),on(this.root.viewportNode,"scroll",()=>this.handleScroll())})}handleScroll=(manual=!1)=>{if(manual||this.scrollButtonState.handleUserScroll(),!this.root.viewportNode)return;const paddingTop=Number.parseInt(getComputedStyle(this.root.viewportNode).paddingTop,10);this.canScrollUp=
-this.root.viewportNode.scrollTop-paddingTop>.1};handleAutoScroll=()=>{!this.root.viewportNode||!this.root.highlightedNode||(this.root.viewportNode.scrollTop=this.root.viewportNode.scrollTop-this.root.highlightedNode.offsetHeight)};#props=user_derived(()=>({...this.scrollButtonState.props,[this.root.getBitsAttr("scroll-up-button")]:""}));get props(){return get$3(this.#props)}set props(value){set$1(this.#props,value)}}function Select_hidden_input($$anchor,$$props){push$1($$props,!0);let value=prop(
-$$props,"value",15);const hiddenInputState=SelectHiddenInputState.create({value:boxWith$1(()=>value())});var fragment=comment$2(),node2=first_child(fragment);{var consequent=$$anchor2=>{Hidden_input($$anchor2,spread_props(()=>hiddenInputState.props,{get autocomplete(){return $$props.autocomplete},get value(){return value()},set value($$value){value($$value)}}))};if_block(node2,$$render=>{hiddenInputState.shouldRender&&$$render(consequent)})}append($$anchor,fragment),pop()}function Floating_layer_anchor($$anchor,$$props){
-push$1($$props,!0);let tooltip=prop($$props,"tooltip",3,!1);FloatingAnchorState.create({id:boxWith$1(()=>$$props.id),virtualEl:boxWith$1(()=>$$props.virtualEl),ref:$$props.ref},tooltip());var fragment=comment$2(),node2=first_child(fragment);snippet(node2,()=>$$props.children??noop$3),append($$anchor,fragment),pop()}var root_4$K=from_svg('<svg viewBox="0 0 30 10" preserveAspectRatio="none" data-arrow=""><polygon points="0,0 30,0 15,10" fill="currentColor"></polygon></svg>'),root_2$1t=from_html("<\
-span><!></span>");function Arrow($$anchor,$$props){push$1($$props,!0);let id2=prop($$props,"id",19,useId),width=prop($$props,"width",3,10),height=prop($$props,"height",3,5),restProps=rest_props($$props,["$$slots","$$events","$$legacy","id","children","child","width","height"]);const mergedProps=user_derived(()=>mergeProps(restProps,{id:id2()}));var fragment=comment$2(),node2=first_child(fragment);{var consequent=$$anchor2=>{var fragment_1=comment$2(),node_1=first_child(fragment_1);snippet(node_1,
-()=>$$props.child,()=>({props:get$3(mergedProps)})),append($$anchor2,fragment_1)},alternate_1=$$anchor2=>{var span=root_2$1t();attribute_effect(span,()=>({...get$3(mergedProps)}));var node_2=child(span);{var consequent_1=$$anchor3=>{var fragment_2=comment$2(),node_3=first_child(fragment_2);snippet(node_3,()=>$$props.children??noop$3),append($$anchor3,fragment_2)},alternate=$$anchor3=>{var svg2=root_4$K();template_effect(()=>{set_attribute(svg2,"width",width()),set_attribute(svg2,"height",height())}),
-append($$anchor3,svg2)};if_block(node_2,$$render=>{$$props.children?$$render(consequent_1):$$render(alternate,-1)})}reset(span),append($$anchor2,span)};if_block(node2,$$render=>{$$props.child?$$render(consequent):$$render(alternate_1,-1)})}append($$anchor,fragment),pop()}function Floating_layer_arrow($$anchor,$$props){push$1($$props,!0);let id2=prop($$props,"id",19,useId),ref2=prop($$props,"ref",15,null),restProps=rest_props($$props,["$$slots","$$events","$$legacy","id","ref"]);const arrowState=FloatingArrowState.
-create({id:boxWith$1(()=>id2()),ref:boxWith$1(()=>ref2(),v=>ref2(v))}),mergedProps=user_derived(()=>mergeProps(restProps,arrowState.props));Arrow($$anchor,spread_props(()=>get$3(mergedProps))),pop()}function Floating_layer_content($$anchor,$$props){push$1($$props,!0);let side=prop($$props,"side",3,"bottom"),sideOffset=prop($$props,"sideOffset",3,0),align=prop($$props,"align",3,"center"),alignOffset=prop($$props,"alignOffset",3,0),arrowPadding=prop($$props,"arrowPadding",3,0),avoidCollisions=prop(
-$$props,"avoidCollisions",3,!0),collisionBoundary=prop($$props,"collisionBoundary",19,()=>[]),collisionPadding=prop($$props,"collisionPadding",3,0),hideWhenDetached=prop($$props,"hideWhenDetached",3,!1),onPlaced=prop($$props,"onPlaced",3,()=>{}),sticky=prop($$props,"sticky",3,"partial"),updatePositionStrategy=prop($$props,"updatePositionStrategy",3,"optimized"),strategy=prop($$props,"strategy",3,"fixed"),dir=prop($$props,"dir",3,"ltr"),style2=prop($$props,"style",19,()=>({})),wrapperId=prop($$props,
-"wrapperId",19,useId),customAnchor=prop($$props,"customAnchor",3,null),tooltip=prop($$props,"tooltip",3,!1);const contentState=FloatingContentState.create({side:boxWith$1(()=>side()),sideOffset:boxWith$1(()=>sideOffset()),align:boxWith$1(()=>align()),alignOffset:boxWith$1(()=>alignOffset()),id:boxWith$1(()=>$$props.id),arrowPadding:boxWith$1(()=>arrowPadding()),avoidCollisions:boxWith$1(()=>avoidCollisions()),collisionBoundary:boxWith$1(()=>collisionBoundary()),collisionPadding:boxWith$1(()=>collisionPadding()),
-hideWhenDetached:boxWith$1(()=>hideWhenDetached()),onPlaced:boxWith$1(()=>onPlaced()),sticky:boxWith$1(()=>sticky()),updatePositionStrategy:boxWith$1(()=>updatePositionStrategy()),strategy:boxWith$1(()=>strategy()),dir:boxWith$1(()=>dir()),style:boxWith$1(()=>style2()),enabled:boxWith$1(()=>$$props.enabled),wrapperId:boxWith$1(()=>wrapperId()),customAnchor:boxWith$1(()=>customAnchor())},tooltip()),mergedProps=user_derived(()=>mergeProps(contentState.wrapperProps,{style:{pointerEvents:"auto"}}));
-var fragment=comment$2(),node2=first_child(fragment);snippet(node2,()=>$$props.content??noop$3,()=>({props:contentState.props,wrapperProps:get$3(mergedProps)})),append($$anchor,fragment),pop()}function Floating_layer_content_static($$anchor,$$props){push$1($$props,!0),onMount$1(()=>{$$props.onPlaced?.()});var fragment=comment$2(),node2=first_child(fragment);snippet(node2,()=>$$props.content??noop$3,()=>({props:{},wrapperProps:{}})),append($$anchor,fragment),pop()}function Popper_content($$anchor,$$props){
-let isStatic=prop($$props,"isStatic",3,!1),restProps=rest_props($$props,["$$slots","$$events","$$legacy","content","isStatic","onPlaced"]);var fragment=comment$2(),node2=first_child(fragment);{var consequent=$$anchor2=>{Floating_layer_content_static($$anchor2,{get content(){return $$props.content},get onPlaced(){return $$props.onPlaced}})},alternate=$$anchor2=>{Floating_layer_content($$anchor2,spread_props({get content(){return $$props.content},get onPlaced(){return $$props.onPlaced}},()=>restProps))};
-if_block(node2,$$render=>{isStatic()?$$render(consequent):$$render(alternate,-1)})}append($$anchor,fragment)}var root_1$19=from_html("<!> <!>",1);function Popper_layer_inner($$anchor,$$props){push$1($$props,!0);let interactOutsideBehavior=prop($$props,"interactOutsideBehavior",3,"close"),trapFocus=prop($$props,"trapFocus",3,!0),isValidEvent2=prop($$props,"isValidEvent",3,()=>!1),customAnchor=prop($$props,"customAnchor",3,null),isStatic=prop($$props,"isStatic",3,!1),tooltip=prop($$props,"tooltip",
-3,!1),contentPointerEvents=prop($$props,"contentPointerEvents",3,"auto"),restProps=rest_props($$props,["$$slots","$$events","$$legacy","popper","onEscapeKeydown","escapeKeydownBehavior","preventOverflowTextSelection","id","onPointerDown","onPointerUp","side","sideOffset","align","alignOffset","arrowPadding","avoidCollisions","collisionBoundary","collisionPadding","sticky","hideWhenDetached","updatePositionStrategy","strategy","dir","preventScroll","wrapperId","style","onPlaced","onInteractOutsid\
-e","onCloseAutoFocus","onOpenAutoFocus","onFocusOutside","interactOutsideBehavior","loop","trapFocus","isValidEvent","customAnchor","isStatic","enabled","ref","tooltip","contentPointerEvents"]);const resolvedPreventScroll=user_derived(()=>$$props.preventScroll??!0),effectiveStrategy=user_derived(()=>$$props.strategy??(get$3(resolvedPreventScroll)?"fixed":"absolute"));Popper_content($$anchor,{get isStatic(){return isStatic()},get id(){return $$props.id},get side(){return $$props.side},get sideOffset(){
-return $$props.sideOffset},get align(){return $$props.align},get alignOffset(){return $$props.alignOffset},get arrowPadding(){return $$props.arrowPadding},get avoidCollisions(){return $$props.avoidCollisions},get collisionBoundary(){return $$props.collisionBoundary},get collisionPadding(){return $$props.collisionPadding},get sticky(){return $$props.sticky},get hideWhenDetached(){return $$props.hideWhenDetached},get updatePositionStrategy(){return $$props.updatePositionStrategy},get strategy(){return get$3(
-effectiveStrategy)},get dir(){return $$props.dir},get wrapperId(){return $$props.wrapperId},get style(){return $$props.style},get onPlaced(){return $$props.onPlaced},get customAnchor(){return customAnchor()},get enabled(){return $$props.enabled},get tooltip(){return tooltip()},content:($$anchor2,$$arg0)=>{let floatingProps=()=>$$arg0?.().props,wrapperProps=()=>$$arg0?.().wrapperProps;var fragment_1=root_1$19(),node2=first_child(fragment_1);{var consequent=$$anchor3=>{Scroll_lock($$anchor3,{get preventScroll(){
-return get$3(resolvedPreventScroll)}})},consequent_1=$$anchor3=>{Scroll_lock($$anchor3,{get preventScroll(){return get$3(resolvedPreventScroll)}})};if_block(node2,$$render=>{$$props.forceMount&&$$props.enabled?$$render(consequent):$$props.forceMount||$$render(consequent_1,1)})}var node_1=sibling(node2,2);Focus_scope(node_1,{get onOpenAutoFocus(){return $$props.onOpenAutoFocus},get onCloseAutoFocus(){return $$props.onCloseAutoFocus},get loop(){return $$props.loop},get enabled(){return $$props.enabled},
-get trapFocus(){return trapFocus()},get forceMount(){return $$props.forceMount},get ref(){return $$props.ref},focusScope:($$anchor3,$$arg02)=>{let focusScopeProps=()=>$$arg02?.().props;Escape_layer($$anchor3,{get onEscapeKeydown(){return $$props.onEscapeKeydown},get escapeKeydownBehavior(){return $$props.escapeKeydownBehavior},get enabled(){return $$props.enabled},get ref(){return $$props.ref},children:($$anchor4,$$slotProps)=>{Dismissible_layer($$anchor4,{get id(){return $$props.id},get onInteractOutside(){
-return $$props.onInteractOutside},get onFocusOutside(){return $$props.onFocusOutside},get interactOutsideBehavior(){return interactOutsideBehavior()},get isValidEvent(){return isValidEvent2()},get enabled(){return $$props.enabled},get ref(){return $$props.ref},children:($$anchor5,$$arg03)=>{let dismissibleProps=()=>$$arg03?.().props;Text_selection_layer($$anchor5,{get id(){return $$props.id},get preventOverflowTextSelection(){return $$props.preventOverflowTextSelection},get onPointerDown(){return $$props.
-onPointerDown},get onPointerUp(){return $$props.onPointerUp},get enabled(){return $$props.enabled},get ref(){return $$props.ref},children:($$anchor6,$$slotProps2)=>{var fragment_7=comment$2(),node_2=first_child(fragment_7);{let $0=user_derived(()=>({props:mergeProps(restProps,floatingProps(),dismissibleProps(),focusScopeProps(),{style:{pointerEvents:contentPointerEvents()}}),wrapperProps:wrapperProps()}));snippet(node_2,()=>$$props.popper??noop$3,()=>get$3($0))}append($$anchor6,fragment_7)},$$slots:{
-default:!0}})},$$slots:{default:!0}})},$$slots:{default:!0}})},$$slots:{focusScope:!0}}),append($$anchor2,fragment_1)},$$slots:{content:!0}}),pop()}function Popper_layer($$anchor,$$props){let interactOutsideBehavior=prop($$props,"interactOutsideBehavior",3,"close"),trapFocus=prop($$props,"trapFocus",3,!0),isValidEvent2=prop($$props,"isValidEvent",3,()=>!1),customAnchor=prop($$props,"customAnchor",3,null),isStatic=prop($$props,"isStatic",3,!1),restProps=rest_props($$props,["$$slots","$$events","$\
-$legacy","popper","open","onEscapeKeydown","escapeKeydownBehavior","preventOverflowTextSelection","id","onPointerDown","onPointerUp","side","sideOffset","align","alignOffset","arrowPadding","avoidCollisions","collisionBoundary","collisionPadding","sticky","hideWhenDetached","updatePositionStrategy","strategy","dir","preventScroll","wrapperId","style","onPlaced","onInteractOutside","onCloseAutoFocus","onOpenAutoFocus","onFocusOutside","interactOutsideBehavior","loop","trapFocus","isValidEvent","c\
-ustomAnchor","isStatic","ref","shouldRender"]);var fragment=comment$2(),node2=first_child(fragment);{var consequent=$$anchor2=>{Popper_layer_inner($$anchor2,spread_props({get popper(){return $$props.popper},get onEscapeKeydown(){return $$props.onEscapeKeydown},get escapeKeydownBehavior(){return $$props.escapeKeydownBehavior},get preventOverflowTextSelection(){return $$props.preventOverflowTextSelection},get id(){return $$props.id},get onPointerDown(){return $$props.onPointerDown},get onPointerUp(){
-return $$props.onPointerUp},get side(){return $$props.side},get sideOffset(){return $$props.sideOffset},get align(){return $$props.align},get alignOffset(){return $$props.alignOffset},get arrowPadding(){return $$props.arrowPadding},get avoidCollisions(){return $$props.avoidCollisions},get collisionBoundary(){return $$props.collisionBoundary},get collisionPadding(){return $$props.collisionPadding},get sticky(){return $$props.sticky},get hideWhenDetached(){return $$props.hideWhenDetached},get updatePositionStrategy(){
+this.#contentNode)}set contentNode(value){set$1(this.#contentNode,value,!0)}contentPresence;#viewportNode=state$1(null);get viewportNode(){return get$3(this.#viewportNode)}set viewportNode(value){set$1(this.#viewportNode,value,!0)}#triggerNode=state$1(null);get triggerNode(){return get$3(this.#triggerNode)}set triggerNode(value){set$1(this.#triggerNode,value,!0)}#valueNode=state$1(null);get valueNode(){return get$3(this.#valueNode)}set valueNode(value){set$1(this.#valueNode,value,!0)}#valueId=state$1(
+"");get valueId(){return get$3(this.#valueId)}set valueId(value){set$1(this.#valueId,value,!0)}#highlightedNode=state$1(null);get highlightedNode(){return get$3(this.#highlightedNode)}set highlightedNode(value){set$1(this.#highlightedNode,value,!0)}#highlightedValue=user_derived(()=>this.highlightedNode?this.highlightedNode.getAttribute("data-value"):null);get highlightedValue(){return get$3(this.#highlightedValue)}set highlightedValue(value){set$1(this.#highlightedValue,value)}#highlightedId=user_derived(
+()=>{if(this.highlightedNode)return this.highlightedNode.id});get highlightedId(){return get$3(this.#highlightedId)}set highlightedId(value){set$1(this.#highlightedId,value)}#highlightedLabel=user_derived(()=>this.highlightedNode?this.highlightedNode.getAttribute("data-label"):null);get highlightedLabel(){return get$3(this.#highlightedLabel)}set highlightedLabel(value){set$1(this.#highlightedLabel,value)}#contentIsPositioned=state$1(!1);get contentIsPositioned(){return get$3(this.#contentIsPositioned)}set contentIsPositioned(value){
+set$1(this.#contentIsPositioned,value,!0)}isUsingKeyboard=!1;isCombobox=!1;domContext=new DOMContext(()=>null);constructor(opts){this.opts=opts,this.isCombobox=opts.isCombobox,this.contentPresence=new PresenceManager({ref:boxWith$1(()=>this.contentNode),open:this.opts.open,onComplete:()=>{this.opts.onOpenChangeComplete.current(this.opts.open.current)}}),user_pre_effect(()=>{this.opts.open.current||this.setHighlightedNode(null)})}setHighlightedNode(node2,initial=!1){this.highlightedNode=node2,node2&&
+(this.isUsingKeyboard||initial)&&this.scrollHighlightedNodeIntoView(node2)}scrollHighlightedNodeIntoView(node2){!this.viewportNode||!this.contentIsPositioned||node2.scrollIntoView({block:this.opts.scrollAlignment.current})}getCandidateNodes(){const node2=this.contentNode;return node2?Array.from(node2.querySelectorAll(`[${this.getBitsAttr("item")}]:not([data-disabled])`)):[]}setHighlightedToFirstCandidate(initial=!1){this.setHighlightedNode(null);let nodes2=this.getCandidateNodes();if(nodes2.length){
+if(this.viewportNode){const viewportRect=this.viewportNode.getBoundingClientRect();nodes2=nodes2.filter(node2=>{if(!this.viewportNode)return!1;const nodeRect=node2.getBoundingClientRect();return nodeRect.right<=viewportRect.right&&nodeRect.left>=viewportRect.left&&nodeRect.bottom<=viewportRect.bottom&&nodeRect.top>=viewportRect.top})}this.setHighlightedNode(nodes2[0],initial)}}getNodeByValue(value){return this.getCandidateNodes().find(node2=>node2.dataset.value===value)??null}getLabelForValue(value){
+if(value==="")return"";const fromItems=this.opts.items.current.find(item=>item.value===value)?.label;if(fromItems!==void 0)return fromItems;const node2=this.getNodeByValue(value);if(node2){const dataLabel=node2.getAttribute("data-label");return dataLabel!==null&&dataLabel!==""?dataLabel:node2.textContent?.trim()??value}return value}setOpen(open2){this.opts.open.current=open2}toggleOpen(){this.opts.open.current=!this.opts.open.current}handleOpen(){this.setOpen(!0)}handleClose(){this.setHighlightedNode(
+null),this.setOpen(!1)}toggleMenu(){this.toggleOpen()}getBitsAttr=part=>selectAttrs.getAttr(part,this.isCombobox?"combobox":void 0)}class SelectSingleRootState extends SelectBaseRootState{opts;isMulti=!1;#hasValue=user_derived(()=>this.opts.value.current!=="");get hasValue(){return get$3(this.#hasValue)}set hasValue(value){set$1(this.#hasValue,value)}#currentLabel=user_derived(()=>this.opts.items.current.length?this.opts.items.current.find(item=>item.value===this.opts.value.current)?.label??"":"");get currentLabel(){
+return get$3(this.#currentLabel)}set currentLabel(value){set$1(this.#currentLabel,value)}#candidateLabels=user_derived(()=>this.opts.items.current.length?this.opts.items.current.filter(item=>!item.disabled).map(item=>item.label):[]);get candidateLabels(){return get$3(this.#candidateLabels)}set candidateLabels(value){set$1(this.#candidateLabels,value)}#dataTypeaheadEnabled=user_derived(()=>!(this.isMulti||this.opts.items.current.length===0));get dataTypeaheadEnabled(){return get$3(this.#dataTypeaheadEnabled)}set dataTypeaheadEnabled(value){
+set$1(this.#dataTypeaheadEnabled,value)}constructor(opts){super(opts),this.opts=opts,user_effect(()=>{!this.opts.open.current&&this.highlightedNode&&this.setHighlightedNode(null)}),watch$1(()=>this.opts.open.current,()=>{this.opts.open.current&&this.setInitialHighlightedNode()})}includesItem(itemValue){return this.opts.value.current===itemValue}toggleItem(itemValue,itemLabel=itemValue){const newValue=this.includesItem(itemValue)?"":itemValue;this.opts.value.current=newValue,newValue!==""&&(this.
+opts.inputValue.current=itemLabel)}setInitialHighlightedNode(){afterTick(()=>{if(!(this.highlightedNode&&this.domContext.getDocument().contains(this.highlightedNode))){if(this.opts.value.current!==""){const node2=this.getNodeByValue(this.opts.value.current);if(node2){this.setHighlightedNode(node2,!0);return}}this.setHighlightedToFirstCandidate(!0)}})}}class SelectMultipleRootState extends SelectBaseRootState{opts;isMulti=!0;#hasValue=user_derived(()=>this.opts.value.current.length>0);get hasValue(){
+return get$3(this.#hasValue)}set hasValue(value){set$1(this.#hasValue,value)}constructor(opts){super(opts),this.opts=opts,user_effect(()=>{!this.opts.open.current&&this.highlightedNode&&this.setHighlightedNode(null)}),watch$1(()=>this.opts.open.current,()=>{this.opts.open.current&&this.setInitialHighlightedNode()})}includesItem(itemValue){return this.opts.value.current.includes(itemValue)}toggleItem(itemValue,itemLabel=itemValue){this.includesItem(itemValue)?this.opts.value.current=this.opts.value.
+current.filter(v=>v!==itemValue):this.opts.value.current=[...this.opts.value.current,itemValue],this.opts.inputValue.current=itemLabel}setInitialHighlightedNode(){afterTick(()=>{if(this.domContext&&!(this.highlightedNode&&this.domContext.getDocument().contains(this.highlightedNode))){if(this.opts.value.current.length&&this.opts.value.current[0]!==""){const node2=this.getNodeByValue(this.opts.value.current[0]);if(node2){this.setHighlightedNode(node2,!0);return}}this.setHighlightedToFirstCandidate(
+!0)}})}}class SelectRootState{static create(props){const{type:type2,...rest}=props,rootState=type2==="single"?new SelectSingleRootState(rest):new SelectMultipleRootState(rest);return SelectRootContext.set(rootState)}}class SelectTriggerState{static create(opts){return new SelectTriggerState(opts,SelectRootContext.get())}opts;root;attachment;#domTypeahead;#dataTypeahead;constructor(opts,root2){this.opts=opts,this.root=root2,this.attachment=attachRef(opts.ref,v=>this.root.triggerNode=v),this.root.
+domContext=new DOMContext(opts.ref),this.#domTypeahead=new DOMTypeahead({getCurrentItem:()=>this.root.highlightedNode,onMatch:node2=>{this.root.setHighlightedNode(node2)},getActiveElement:()=>this.root.domContext.getActiveElement(),getWindow:()=>this.root.domContext.getWindow()}),this.#dataTypeahead=new DataTypeahead({getCurrentItem:()=>this.root.isMulti?"":this.root.currentLabel,onMatch:label=>{if(this.root.isMulti||!this.root.opts.items.current)return;const matchedItem=this.root.opts.items.current.
+find(item=>item.label===label);matchedItem&&(this.root.opts.value.current=matchedItem.value)},enabled:()=>!this.root.isMulti&&this.root.dataTypeaheadEnabled,candidateValues:()=>this.root.isMulti?[]:this.root.candidateLabels,getWindow:()=>this.root.domContext.getWindow()}),this.onkeydown=this.onkeydown.bind(this),this.onpointerdown=this.onpointerdown.bind(this),this.onpointerup=this.onpointerup.bind(this),this.onclick=this.onclick.bind(this)}#handleOpen(){this.root.opts.open.current=!0,this.#dataTypeahead.
+resetTypeahead(),this.#domTypeahead.resetTypeahead()}#handlePointerOpen(_){this.#handleOpen()}#handleKeyboardSelection(){const isCurrentSelectedValue=this.root.highlightedValue===this.root.opts.value.current;return!this.root.opts.allowDeselect.current&&isCurrentSelectedValue&&!this.root.isMulti?(this.root.handleClose(),!0):(this.root.highlightedValue!==null&&this.root.toggleItem(this.root.highlightedValue,this.root.highlightedLabel??void 0),!this.root.isMulti&&!isCurrentSelectedValue?(this.root.
+handleClose(),!0):!1)}onkeydown(e){if(this.root.isUsingKeyboard=!0,(e.key===ARROW_UP||e.key===ARROW_DOWN)&&e.preventDefault(),!this.root.opts.open.current){if(e.key===ENTER||e.key===SPACE||e.key===ARROW_DOWN||e.key===ARROW_UP)e.preventDefault(),this.root.handleOpen();else if(!this.root.isMulti&&this.root.dataTypeaheadEnabled){this.#dataTypeahead.handleTypeaheadSearch(e.key);return}if(this.root.hasValue)return;const candidateNodes2=this.root.getCandidateNodes();if(!candidateNodes2.length)return;if(e.
+key===ARROW_DOWN){const firstCandidate=candidateNodes2[0];this.root.setHighlightedNode(firstCandidate)}else if(e.key===ARROW_UP){const lastCandidate=candidateNodes2[candidateNodes2.length-1];this.root.setHighlightedNode(lastCandidate)}return}if(e.key===TAB){this.root.handleClose();return}if((e.key===ENTER||e.key===SPACE&&this.#domTypeahead.search==="")&&!e.isComposing&&(e.preventDefault(),this.#handleKeyboardSelection()))return;if(e.key===ARROW_UP&&e.altKey&&this.root.handleClose(),FIRST_LAST_KEYS.
+includes(e.key)){e.preventDefault();const candidateNodes2=this.root.getCandidateNodes(),currHighlightedNode=this.root.highlightedNode,currIndex=currHighlightedNode?candidateNodes2.indexOf(currHighlightedNode):-1,loop2=this.root.opts.loop.current;let nextItem;if(e.key===ARROW_DOWN?nextItem=next(candidateNodes2,currIndex,loop2):e.key===ARROW_UP?nextItem=prev(candidateNodes2,currIndex,loop2):e.key===PAGE_DOWN?nextItem=forward(candidateNodes2,currIndex,10,loop2):e.key===PAGE_UP?nextItem=backward(candidateNodes2,
+currIndex,10,loop2):e.key===HOME?nextItem=candidateNodes2[0]:e.key===END&&(nextItem=candidateNodes2[candidateNodes2.length-1]),!nextItem)return;this.root.setHighlightedNode(nextItem);return}const isModifierKey=e.ctrlKey||e.altKey||e.metaKey,isCharacterKey=e.key.length===1,isSpaceKey=e.key===SPACE,candidateNodes=this.root.getCandidateNodes();if(e.key!==TAB){if(!isModifierKey&&(isCharacterKey||isSpaceKey)){!this.#domTypeahead.handleTypeaheadSearch(e.key,candidateNodes)&&isSpaceKey&&(e.preventDefault(),
+this.#handleKeyboardSelection());return}this.root.highlightedNode||this.root.setHighlightedToFirstCandidate()}}onclick(e){e.currentTarget.focus()}onpointerdown(e){if(this.root.opts.disabled.current)return;if(e.pointerType==="touch")return e.preventDefault();const target2=e.target;target2?.hasPointerCapture(e.pointerId)&&target2?.releasePointerCapture(e.pointerId),e.button===0&&e.ctrlKey===!1&&(this.root.opts.open.current===!1?this.#handlePointerOpen(e):this.root.handleClose())}onpointerup(e){this.
+root.opts.disabled.current||(e.preventDefault(),e.pointerType==="touch"&&(this.root.opts.open.current===!1?this.#handlePointerOpen(e):this.root.handleClose()))}#props=user_derived(()=>({id:this.opts.id.current,disabled:this.root.opts.disabled.current?!0:void 0,"aria-haspopup":"listbox","aria-expanded":boolToStr(this.root.opts.open.current),"aria-activedescendant":this.root.highlightedId,"data-state":getDataOpenClosed(this.root.opts.open.current),"data-disabled":boolToEmptyStrOrUndef(this.root.opts.
+disabled.current),"data-placeholder":this.root.hasValue?void 0:"",[this.root.getBitsAttr("trigger")]:"",onpointerdown:this.onpointerdown,onkeydown:this.onkeydown,onclick:this.onclick,onpointerup:this.onpointerup,...this.attachment}));get props(){return get$3(this.#props)}set props(value){set$1(this.#props,value)}}class SelectContentState{static create(opts){return SelectContentContext.set(new SelectContentState(opts,SelectRootContext.get()))}opts;root;attachment;#isPositioned=state$1(!1);get isPositioned(){
+return get$3(this.#isPositioned)}set isPositioned(value){set$1(this.#isPositioned,value,!0)}domContext;constructor(opts,root2){this.opts=opts,this.root=root2,this.attachment=attachRef(opts.ref,v=>this.root.contentNode=v),this.domContext=new DOMContext(this.opts.ref),this.root.domContext===null&&(this.root.domContext=this.domContext),onDestroyEffect(()=>{this.root.contentNode=null,this.root.contentIsPositioned=!1,this.isPositioned=!1}),watch$1(()=>this.root.opts.open.current,()=>{this.root.opts.open.
+current||(this.root.contentIsPositioned=!1,this.isPositioned=!1)}),watch$1([()=>this.isPositioned,()=>this.root.highlightedNode],()=>{!this.isPositioned||!this.root.highlightedNode||this.root.scrollHighlightedNodeIntoView(this.root.highlightedNode)}),this.onpointermove=this.onpointermove.bind(this)}onpointermove(_){this.root.isUsingKeyboard=!1}#styles=user_derived(()=>getFloatingContentCSSVars(this.root.isCombobox?"combobox":"select"));onInteractOutside=e=>{if(e.target===this.root.triggerNode||e.
+target===this.root.inputNode){e.preventDefault();return}this.opts.onInteractOutside.current(e),!e.defaultPrevented&&this.root.handleClose()};onEscapeKeydown=e=>{this.opts.onEscapeKeydown.current(e),!e.defaultPrevented&&this.root.handleClose()};onOpenAutoFocus=e=>{e.preventDefault()};onCloseAutoFocus=e=>{e.preventDefault()};get shouldRender(){return this.root.contentPresence.shouldRender}#snippetProps=user_derived(()=>({open:this.root.opts.open.current}));get snippetProps(){return get$3(this.#snippetProps)}set snippetProps(value){
+set$1(this.#snippetProps,value)}#props=user_derived(()=>({id:this.opts.id.current,role:"listbox","aria-multiselectable":this.root.isMulti?"true":void 0,"data-state":getDataOpenClosed(this.root.opts.open.current),...getDataTransitionAttrs(this.root.contentPresence.transitionStatus),[this.root.getBitsAttr("content")]:"",style:{display:"flex",flexDirection:"column",outline:"none",boxSizing:"border-box",pointerEvents:"auto",...get$3(this.#styles)},onpointermove:this.onpointermove,...this.attachment}));get props(){
+return get$3(this.#props)}set props(value){set$1(this.#props,value)}popperProps={onInteractOutside:this.onInteractOutside,onEscapeKeydown:this.onEscapeKeydown,onOpenAutoFocus:this.onOpenAutoFocus,onCloseAutoFocus:this.onCloseAutoFocus,trapFocus:!1,loop:!1,onPlaced:()=>{this.root.opts.open.current&&(this.root.contentIsPositioned=!0,this.isPositioned=!0)}}}class SelectItemState{static create(opts){return new SelectItemState(opts,SelectRootContext.get())}opts;root;attachment;#isSelected=user_derived(
+()=>this.root.includesItem(this.opts.value.current));get isSelected(){return get$3(this.#isSelected)}set isSelected(value){set$1(this.#isSelected,value)}#isHighlighted=user_derived(()=>this.root.highlightedValue===this.opts.value.current);get isHighlighted(){return get$3(this.#isHighlighted)}set isHighlighted(value){set$1(this.#isHighlighted,value)}prevHighlighted=new Previous(()=>this.isHighlighted);#mounted=state$1(!1);get mounted(){return get$3(this.#mounted)}set mounted(value){set$1(this.#mounted,
+value,!0)}constructor(opts,root2){this.opts=opts,this.root=root2,this.attachment=attachRef(opts.ref),watch$1([()=>this.isHighlighted,()=>this.prevHighlighted.current],()=>{this.isHighlighted?this.opts.onHighlight.current():this.prevHighlighted.current&&this.opts.onUnhighlight.current()}),watch$1(()=>this.mounted,()=>{this.mounted&&this.root.setInitialHighlightedNode()}),this.onpointerdown=this.onpointerdown.bind(this),this.onpointerup=this.onpointerup.bind(this),this.onpointermove=this.onpointermove.
+bind(this)}handleSelect(){if(this.opts.disabled.current)return;const isCurrentSelectedValue=this.opts.value.current===this.root.opts.value.current;if(!this.root.opts.allowDeselect.current&&isCurrentSelectedValue&&!this.root.isMulti){this.root.handleClose();return}this.root.toggleItem(this.opts.value.current,this.opts.label.current),!this.root.isMulti&&!isCurrentSelectedValue&&this.root.handleClose()}#snippetProps=user_derived(()=>({selected:this.isSelected,highlighted:this.isHighlighted}));get snippetProps(){
+return get$3(this.#snippetProps)}set snippetProps(value){set$1(this.#snippetProps,value)}onpointerdown(e){e.preventDefault()}onpointerup(e){if(!(e.defaultPrevented||!this.opts.ref.current)){if(e.pointerType==="touch"&&!isIOS){on(this.opts.ref.current,"click",()=>{this.handleSelect(),this.root.setHighlightedNode(this.opts.ref.current)},{once:!0});return}e.preventDefault(),this.handleSelect(),e.pointerType==="touch"&&this.root.setHighlightedNode(this.opts.ref.current)}}onpointermove(e){e.pointerType!==
+"touch"&&this.root.highlightedNode!==this.opts.ref.current&&this.root.setHighlightedNode(this.opts.ref.current)}#props=user_derived(()=>({id:this.opts.id.current,role:"option","aria-selected":this.root.includesItem(this.opts.value.current)?"true":void 0,"data-value":this.opts.value.current,"data-disabled":boolToEmptyStrOrUndef(this.opts.disabled.current),"data-highlighted":this.root.highlightedValue===this.opts.value.current&&!this.opts.disabled.current?"":void 0,"data-selected":this.root.includesItem(
+this.opts.value.current)?"":void 0,"data-label":this.opts.label.current,[this.root.getBitsAttr("item")]:"",onpointermove:this.onpointermove,onpointerdown:this.onpointerdown,onpointerup:this.onpointerup,...this.attachment}));get props(){return get$3(this.#props)}set props(value){set$1(this.#props,value)}}class SelectHiddenInputState{static create(opts){return new SelectHiddenInputState(opts,SelectRootContext.get())}opts;root;#shouldRender=user_derived(()=>this.root.opts.name.current!=="");get shouldRender(){
+return get$3(this.#shouldRender)}set shouldRender(value){set$1(this.#shouldRender,value)}constructor(opts,root2){this.opts=opts,this.root=root2,this.onfocus=this.onfocus.bind(this)}onfocus(e){e.preventDefault(),this.root.isCombobox?this.root.inputNode?.focus():this.root.triggerNode?.focus()}#props=user_derived(()=>({disabled:boolToTrueOrUndef(this.root.opts.disabled.current),required:boolToTrueOrUndef(this.root.opts.required.current),name:this.root.opts.name.current,value:this.opts.value.current,
+onfocus:this.onfocus}));get props(){return get$3(this.#props)}set props(value){set$1(this.#props,value)}}class SelectViewportState{static create(opts){return new SelectViewportState(opts,SelectContentContext.get())}opts;content;root;attachment;#prevScrollTop=state$1(0);get prevScrollTop(){return get$3(this.#prevScrollTop)}set prevScrollTop(value){set$1(this.#prevScrollTop,value,!0)}constructor(opts,content2){this.opts=opts,this.content=content2,this.root=content2.root,this.attachment=attachRef(opts.
+ref,v=>{this.root.viewportNode=v})}#props=user_derived(()=>({id:this.opts.id.current,role:"presentation",[this.root.getBitsAttr("viewport")]:"",style:{position:"relative",flex:1,overflow:"auto"},...this.attachment}));get props(){return get$3(this.#props)}set props(value){set$1(this.#props,value)}}class SelectScrollButtonImplState{opts;content;root;attachment;autoScrollTimer=null;userScrollTimer=-1;isUserScrolling=!1;onAutoScroll=noop$1;#mounted=state$1(!1);get mounted(){return get$3(this.#mounted)}set mounted(value){
+set$1(this.#mounted,value,!0)}constructor(opts,content2){this.opts=opts,this.content=content2,this.root=content2.root,this.attachment=attachRef(opts.ref),watch$1([()=>this.mounted],()=>{if(!this.mounted){this.isUserScrolling=!1;return}this.isUserScrolling}),user_effect(()=>{this.mounted||this.clearAutoScrollInterval()}),this.onpointerdown=this.onpointerdown.bind(this),this.onpointermove=this.onpointermove.bind(this),this.onpointerleave=this.onpointerleave.bind(this)}handleUserScroll(){this.content.
+domContext.clearTimeout(this.userScrollTimer),this.isUserScrolling=!0,this.userScrollTimer=this.content.domContext.setTimeout(()=>{this.isUserScrolling=!1},200)}clearAutoScrollInterval(){this.autoScrollTimer!==null&&(this.content.domContext.clearTimeout(this.autoScrollTimer),this.autoScrollTimer=null)}onpointerdown(_){if(this.autoScrollTimer!==null)return;const autoScroll=tick2=>{this.onAutoScroll(),this.autoScrollTimer=this.content.domContext.setTimeout(()=>autoScroll(tick2+1),this.opts.delay.current(
+tick2))};this.autoScrollTimer=this.content.domContext.setTimeout(()=>autoScroll(1),this.opts.delay.current(0))}onpointermove(e){this.onpointerdown(e)}onpointerleave(_){this.clearAutoScrollInterval()}#props=user_derived(()=>({id:this.opts.id.current,"aria-hidden":boolToStrTrueOrUndef(!0),style:{flexShrink:0},onpointerdown:this.onpointerdown,onpointermove:this.onpointermove,onpointerleave:this.onpointerleave,...this.attachment}));get props(){return get$3(this.#props)}set props(value){set$1(this.#props,
+value)}}class SelectScrollDownButtonState{static create(opts){return new SelectScrollDownButtonState(new SelectScrollButtonImplState(opts,SelectContentContext.get()))}scrollButtonState;content;root;#canScrollDown=state$1(!1);get canScrollDown(){return get$3(this.#canScrollDown)}set canScrollDown(value){set$1(this.#canScrollDown,value,!0)}scrollIntoViewTimer=null;constructor(scrollButtonState){this.scrollButtonState=scrollButtonState,this.content=scrollButtonState.content,this.root=scrollButtonState.
+root,this.scrollButtonState.onAutoScroll=this.handleAutoScroll,watch$1([()=>this.root.viewportNode,()=>this.content.isPositioned],()=>{if(!(!this.root.viewportNode||!this.content.isPositioned))return this.handleScroll(!0),on(this.root.viewportNode,"scroll",()=>this.handleScroll())}),watch$1([()=>this.root.opts.inputValue.current,()=>this.root.viewportNode,()=>this.content.isPositioned],()=>{!this.root.viewportNode||!this.content.isPositioned||this.handleScroll(!0)}),watch$1(()=>this.scrollButtonState.
+mounted,()=>{this.scrollButtonState.mounted&&(this.scrollIntoViewTimer&&clearTimeout(this.scrollIntoViewTimer),this.scrollIntoViewTimer=afterSleep(5,()=>{const activeItem=this.root.highlightedNode;activeItem&&this.root.scrollHighlightedNodeIntoView(activeItem)}))})}handleScroll=(manual=!1)=>{if(manual||this.scrollButtonState.handleUserScroll(),!this.root.viewportNode)return;const maxScroll=this.root.viewportNode.scrollHeight-this.root.viewportNode.clientHeight,paddingTop=Number.parseInt(getComputedStyle(
+this.root.viewportNode).paddingTop,10);this.canScrollDown=Math.ceil(this.root.viewportNode.scrollTop)<maxScroll-paddingTop};handleAutoScroll=()=>{const viewport=this.root.viewportNode,selectedItem=this.root.highlightedNode;!viewport||!selectedItem||(viewport.scrollTop=viewport.scrollTop+selectedItem.offsetHeight)};#props=user_derived(()=>({...this.scrollButtonState.props,[this.root.getBitsAttr("scroll-down-button")]:""}));get props(){return get$3(this.#props)}set props(value){set$1(this.#props,value)}}
+class SelectScrollUpButtonState{static create(opts){return new SelectScrollUpButtonState(new SelectScrollButtonImplState(opts,SelectContentContext.get()))}scrollButtonState;content;root;#canScrollUp=state$1(!1);get canScrollUp(){return get$3(this.#canScrollUp)}set canScrollUp(value){set$1(this.#canScrollUp,value,!0)}constructor(scrollButtonState){this.scrollButtonState=scrollButtonState,this.content=scrollButtonState.content,this.root=scrollButtonState.root,this.scrollButtonState.onAutoScroll=this.
+handleAutoScroll,watch$1([()=>this.root.viewportNode,()=>this.content.isPositioned],()=>{if(!(!this.root.viewportNode||!this.content.isPositioned))return this.handleScroll(!0),on(this.root.viewportNode,"scroll",()=>this.handleScroll())})}handleScroll=(manual=!1)=>{if(manual||this.scrollButtonState.handleUserScroll(),!this.root.viewportNode)return;const paddingTop=Number.parseInt(getComputedStyle(this.root.viewportNode).paddingTop,10);this.canScrollUp=this.root.viewportNode.scrollTop-paddingTop>.1};handleAutoScroll=()=>{
+!this.root.viewportNode||!this.root.highlightedNode||(this.root.viewportNode.scrollTop=this.root.viewportNode.scrollTop-this.root.highlightedNode.offsetHeight)};#props=user_derived(()=>({...this.scrollButtonState.props,[this.root.getBitsAttr("scroll-up-button")]:""}));get props(){return get$3(this.#props)}set props(value){set$1(this.#props,value)}}function Select_hidden_input($$anchor,$$props){push$1($$props,!0);let value=prop($$props,"value",15);const hiddenInputState=SelectHiddenInputState.create(
+{value:boxWith$1(()=>value())});var fragment=comment$2(),node2=first_child(fragment);{var consequent=$$anchor2=>{Hidden_input($$anchor2,spread_props(()=>hiddenInputState.props,{get autocomplete(){return $$props.autocomplete},get value(){return value()},set value($$value){value($$value)}}))};if_block(node2,$$render=>{hiddenInputState.shouldRender&&$$render(consequent)})}append($$anchor,fragment),pop()}function Floating_layer_anchor($$anchor,$$props){push$1($$props,!0);let tooltip=prop($$props,"to\
+oltip",3,!1);FloatingAnchorState.create({id:boxWith$1(()=>$$props.id),virtualEl:boxWith$1(()=>$$props.virtualEl),ref:$$props.ref},tooltip());var fragment=comment$2(),node2=first_child(fragment);snippet(node2,()=>$$props.children??noop$3),append($$anchor,fragment),pop()}var root_4$K=from_svg('<svg viewBox="0 0 30 10" preserveAspectRatio="none" data-arrow=""><polygon points="0,0 30,0 15,10" fill="currentColor"></polygon></svg>'),root_2$1t=from_html("<span><!></span>");function Arrow($$anchor,$$props){
+push$1($$props,!0);let id2=prop($$props,"id",19,useId),width=prop($$props,"width",3,10),height=prop($$props,"height",3,5),restProps=rest_props($$props,["$$slots","$$events","$$legacy","id","children","child","width","height"]);const mergedProps=user_derived(()=>mergeProps(restProps,{id:id2()}));var fragment=comment$2(),node2=first_child(fragment);{var consequent=$$anchor2=>{var fragment_1=comment$2(),node_1=first_child(fragment_1);snippet(node_1,()=>$$props.child,()=>({props:get$3(mergedProps)})),
+append($$anchor2,fragment_1)},alternate_1=$$anchor2=>{var span=root_2$1t();attribute_effect(span,()=>({...get$3(mergedProps)}));var node_2=child(span);{var consequent_1=$$anchor3=>{var fragment_2=comment$2(),node_3=first_child(fragment_2);snippet(node_3,()=>$$props.children??noop$3),append($$anchor3,fragment_2)},alternate=$$anchor3=>{var svg2=root_4$K();template_effect(()=>{set_attribute(svg2,"width",width()),set_attribute(svg2,"height",height())}),append($$anchor3,svg2)};if_block(node_2,$$render=>{
+$$props.children?$$render(consequent_1):$$render(alternate,-1)})}reset(span),append($$anchor2,span)};if_block(node2,$$render=>{$$props.child?$$render(consequent):$$render(alternate_1,-1)})}append($$anchor,fragment),pop()}function Floating_layer_arrow($$anchor,$$props){push$1($$props,!0);let id2=prop($$props,"id",19,useId),ref2=prop($$props,"ref",15,null),restProps=rest_props($$props,["$$slots","$$events","$$legacy","id","ref"]);const arrowState=FloatingArrowState.create({id:boxWith$1(()=>id2()),
+ref:boxWith$1(()=>ref2(),v=>ref2(v))}),mergedProps=user_derived(()=>mergeProps(restProps,arrowState.props));Arrow($$anchor,spread_props(()=>get$3(mergedProps))),pop()}function Floating_layer_content($$anchor,$$props){push$1($$props,!0);let side=prop($$props,"side",3,"bottom"),sideOffset=prop($$props,"sideOffset",3,0),align=prop($$props,"align",3,"center"),alignOffset=prop($$props,"alignOffset",3,0),arrowPadding=prop($$props,"arrowPadding",3,0),avoidCollisions=prop($$props,"avoidCollisions",3,!0),
+collisionBoundary=prop($$props,"collisionBoundary",19,()=>[]),collisionPadding=prop($$props,"collisionPadding",3,0),hideWhenDetached=prop($$props,"hideWhenDetached",3,!1),onPlaced=prop($$props,"onPlaced",3,()=>{}),sticky=prop($$props,"sticky",3,"partial"),updatePositionStrategy=prop($$props,"updatePositionStrategy",3,"optimized"),strategy=prop($$props,"strategy",3,"fixed"),dir=prop($$props,"dir",3,"ltr"),style2=prop($$props,"style",19,()=>({})),wrapperId=prop($$props,"wrapperId",19,useId),customAnchor=prop(
+$$props,"customAnchor",3,null),tooltip=prop($$props,"tooltip",3,!1);const contentState=FloatingContentState.create({side:boxWith$1(()=>side()),sideOffset:boxWith$1(()=>sideOffset()),align:boxWith$1(()=>align()),alignOffset:boxWith$1(()=>alignOffset()),id:boxWith$1(()=>$$props.id),arrowPadding:boxWith$1(()=>arrowPadding()),avoidCollisions:boxWith$1(()=>avoidCollisions()),collisionBoundary:boxWith$1(()=>collisionBoundary()),collisionPadding:boxWith$1(()=>collisionPadding()),hideWhenDetached:boxWith$1(
+()=>hideWhenDetached()),onPlaced:boxWith$1(()=>onPlaced()),sticky:boxWith$1(()=>sticky()),updatePositionStrategy:boxWith$1(()=>updatePositionStrategy()),strategy:boxWith$1(()=>strategy()),dir:boxWith$1(()=>dir()),style:boxWith$1(()=>style2()),enabled:boxWith$1(()=>$$props.enabled),wrapperId:boxWith$1(()=>wrapperId()),customAnchor:boxWith$1(()=>customAnchor())},tooltip()),mergedProps=user_derived(()=>mergeProps(contentState.wrapperProps,{style:{pointerEvents:"auto"}}));var fragment=comment$2(),node2=first_child(
+fragment);snippet(node2,()=>$$props.content??noop$3,()=>({props:contentState.props,wrapperProps:get$3(mergedProps)})),append($$anchor,fragment),pop()}function Floating_layer_content_static($$anchor,$$props){push$1($$props,!0),onMount$1(()=>{$$props.onPlaced?.()});var fragment=comment$2(),node2=first_child(fragment);snippet(node2,()=>$$props.content??noop$3,()=>({props:{},wrapperProps:{}})),append($$anchor,fragment),pop()}function Popper_content($$anchor,$$props){let isStatic=prop($$props,"isStat\
+ic",3,!1),restProps=rest_props($$props,["$$slots","$$events","$$legacy","content","isStatic","onPlaced"]);var fragment=comment$2(),node2=first_child(fragment);{var consequent=$$anchor2=>{Floating_layer_content_static($$anchor2,{get content(){return $$props.content},get onPlaced(){return $$props.onPlaced}})},alternate=$$anchor2=>{Floating_layer_content($$anchor2,spread_props({get content(){return $$props.content},get onPlaced(){return $$props.onPlaced}},()=>restProps))};if_block(node2,$$render=>{
+isStatic()?$$render(consequent):$$render(alternate,-1)})}append($$anchor,fragment)}var root_1$19=from_html("<!> <!>",1);function Popper_layer_inner($$anchor,$$props){push$1($$props,!0);let interactOutsideBehavior=prop($$props,"interactOutsideBehavior",3,"close"),trapFocus=prop($$props,"trapFocus",3,!0),isValidEvent2=prop($$props,"isValidEvent",3,()=>!1),customAnchor=prop($$props,"customAnchor",3,null),isStatic=prop($$props,"isStatic",3,!1),tooltip=prop($$props,"tooltip",3,!1),contentPointerEvents=prop(
+$$props,"contentPointerEvents",3,"auto"),restProps=rest_props($$props,["$$slots","$$events","$$legacy","popper","onEscapeKeydown","escapeKeydownBehavior","preventOverflowTextSelection","id","onPointerDown","onPointerUp","side","sideOffset","align","alignOffset","arrowPadding","avoidCollisions","collisionBoundary","collisionPadding","sticky","hideWhenDetached","updatePositionStrategy","strategy","dir","preventScroll","wrapperId","style","onPlaced","onInteractOutside","onCloseAutoFocus","onOpenAut\
+oFocus","onFocusOutside","interactOutsideBehavior","loop","trapFocus","isValidEvent","customAnchor","isStatic","enabled","ref","tooltip","contentPointerEvents"]);const resolvedPreventScroll=user_derived(()=>$$props.preventScroll??!0),effectiveStrategy=user_derived(()=>$$props.strategy??(get$3(resolvedPreventScroll)?"fixed":"absolute"));Popper_content($$anchor,{get isStatic(){return isStatic()},get id(){return $$props.id},get side(){return $$props.side},get sideOffset(){return $$props.sideOffset},
+get align(){return $$props.align},get alignOffset(){return $$props.alignOffset},get arrowPadding(){return $$props.arrowPadding},get avoidCollisions(){return $$props.avoidCollisions},get collisionBoundary(){return $$props.collisionBoundary},get collisionPadding(){return $$props.collisionPadding},get sticky(){return $$props.sticky},get hideWhenDetached(){return $$props.hideWhenDetached},get updatePositionStrategy(){return $$props.updatePositionStrategy},get strategy(){return get$3(effectiveStrategy)},
+get dir(){return $$props.dir},get wrapperId(){return $$props.wrapperId},get style(){return $$props.style},get onPlaced(){return $$props.onPlaced},get customAnchor(){return customAnchor()},get enabled(){return $$props.enabled},get tooltip(){return tooltip()},content:($$anchor2,$$arg0)=>{let floatingProps=()=>$$arg0?.().props,wrapperProps=()=>$$arg0?.().wrapperProps;var fragment_1=root_1$19(),node2=first_child(fragment_1);{var consequent=$$anchor3=>{Scroll_lock($$anchor3,{get preventScroll(){return get$3(
+resolvedPreventScroll)}})},consequent_1=$$anchor3=>{Scroll_lock($$anchor3,{get preventScroll(){return get$3(resolvedPreventScroll)}})};if_block(node2,$$render=>{$$props.forceMount&&$$props.enabled?$$render(consequent):$$props.forceMount||$$render(consequent_1,1)})}var node_1=sibling(node2,2);Focus_scope(node_1,{get onOpenAutoFocus(){return $$props.onOpenAutoFocus},get onCloseAutoFocus(){return $$props.onCloseAutoFocus},get loop(){return $$props.loop},get enabled(){return $$props.enabled},get trapFocus(){
+return trapFocus()},get forceMount(){return $$props.forceMount},get ref(){return $$props.ref},focusScope:($$anchor3,$$arg02)=>{let focusScopeProps=()=>$$arg02?.().props;Escape_layer($$anchor3,{get onEscapeKeydown(){return $$props.onEscapeKeydown},get escapeKeydownBehavior(){return $$props.escapeKeydownBehavior},get enabled(){return $$props.enabled},get ref(){return $$props.ref},children:($$anchor4,$$slotProps)=>{Dismissible_layer($$anchor4,{get id(){return $$props.id},get onInteractOutside(){return $$props.
+onInteractOutside},get onFocusOutside(){return $$props.onFocusOutside},get interactOutsideBehavior(){return interactOutsideBehavior()},get isValidEvent(){return isValidEvent2()},get enabled(){return $$props.enabled},get ref(){return $$props.ref},children:($$anchor5,$$arg03)=>{let dismissibleProps=()=>$$arg03?.().props;Text_selection_layer($$anchor5,{get id(){return $$props.id},get preventOverflowTextSelection(){return $$props.preventOverflowTextSelection},get onPointerDown(){return $$props.onPointerDown},
+get onPointerUp(){return $$props.onPointerUp},get enabled(){return $$props.enabled},get ref(){return $$props.ref},children:($$anchor6,$$slotProps2)=>{var fragment_7=comment$2(),node_2=first_child(fragment_7);{let $0=user_derived(()=>({props:mergeProps(restProps,floatingProps(),dismissibleProps(),focusScopeProps(),{style:{pointerEvents:contentPointerEvents()}}),wrapperProps:wrapperProps()}));snippet(node_2,()=>$$props.popper??noop$3,()=>get$3($0))}append($$anchor6,fragment_7)},$$slots:{default:!0}})},
+$$slots:{default:!0}})},$$slots:{default:!0}})},$$slots:{focusScope:!0}}),append($$anchor2,fragment_1)},$$slots:{content:!0}}),pop()}function Popper_layer($$anchor,$$props){let interactOutsideBehavior=prop($$props,"interactOutsideBehavior",3,"close"),trapFocus=prop($$props,"trapFocus",3,!0),isValidEvent2=prop($$props,"isValidEvent",3,()=>!1),customAnchor=prop($$props,"customAnchor",3,null),isStatic=prop($$props,"isStatic",3,!1),restProps=rest_props($$props,["$$slots","$$events","$$legacy","poppe\
+r","open","onEscapeKeydown","escapeKeydownBehavior","preventOverflowTextSelection","id","onPointerDown","onPointerUp","side","sideOffset","align","alignOffset","arrowPadding","avoidCollisions","collisionBoundary","collisionPadding","sticky","hideWhenDetached","updatePositionStrategy","strategy","dir","preventScroll","wrapperId","style","onPlaced","onInteractOutside","onCloseAutoFocus","onOpenAutoFocus","onFocusOutside","interactOutsideBehavior","loop","trapFocus","isValidEvent","customAnchor","i\
+sStatic","ref","shouldRender"]);var fragment=comment$2(),node2=first_child(fragment);{var consequent=$$anchor2=>{Popper_layer_inner($$anchor2,spread_props({get popper(){return $$props.popper},get onEscapeKeydown(){return $$props.onEscapeKeydown},get escapeKeydownBehavior(){return $$props.escapeKeydownBehavior},get preventOverflowTextSelection(){return $$props.preventOverflowTextSelection},get id(){return $$props.id},get onPointerDown(){return $$props.onPointerDown},get onPointerUp(){return $$props.
+onPointerUp},get side(){return $$props.side},get sideOffset(){return $$props.sideOffset},get align(){return $$props.align},get alignOffset(){return $$props.alignOffset},get arrowPadding(){return $$props.arrowPadding},get avoidCollisions(){return $$props.avoidCollisions},get collisionBoundary(){return $$props.collisionBoundary},get collisionPadding(){return $$props.collisionPadding},get sticky(){return $$props.sticky},get hideWhenDetached(){return $$props.hideWhenDetached},get updatePositionStrategy(){
 return $$props.updatePositionStrategy},get strategy(){return $$props.strategy},get dir(){return $$props.dir},get preventScroll(){return $$props.preventScroll},get wrapperId(){return $$props.wrapperId},get style(){return $$props.style},get onPlaced(){return $$props.onPlaced},get customAnchor(){return customAnchor()},get isStatic(){return isStatic()},get enabled(){return $$props.open},get onInteractOutside(){return $$props.onInteractOutside},get onCloseAutoFocus(){return $$props.onCloseAutoFocus},
 get onOpenAutoFocus(){return $$props.onOpenAutoFocus},get interactOutsideBehavior(){return interactOutsideBehavior()},get loop(){return $$props.loop},get trapFocus(){return trapFocus()},get isValidEvent(){return isValidEvent2()},get onFocusOutside(){return $$props.onFocusOutside},forceMount:!1,get ref(){return $$props.ref}},()=>restProps))};if_block(node2,$$render=>{$$props.shouldRender&&$$render(consequent)})}append($$anchor,fragment)}function Popper_layer_force_mount($$anchor,$$props){let interactOutsideBehavior=prop(
 $$props,"interactOutsideBehavior",3,"close"),trapFocus=prop($$props,"trapFocus",3,!0),isValidEvent2=prop($$props,"isValidEvent",3,()=>!1),customAnchor=prop($$props,"customAnchor",3,null),isStatic=prop($$props,"isStatic",3,!1),restProps=rest_props($$props,["$$slots","$$events","$$legacy","popper","onEscapeKeydown","escapeKeydownBehavior","preventOverflowTextSelection","id","onPointerDown","onPointerUp","side","sideOffset","align","alignOffset","arrowPadding","avoidCollisions","collisionBoundary",
@@ -1324,120 +1325,121 @@ Disable automatic scrolling while messages stream so you can control the viewpor
 ient settings section to edit.",mcpServerUsageStats:"Usage statistics for MCP servers. Tracks how many times tools from each server have been used.",agenticMaxTurns:"Maximum number of tool execution cycles before stopping (prevents infinite loops).",agenticMaxToolPreviewLines:"Number of lines shown in tool output previews (last N lines). Only these previews and the final LLM response persist after the agentic loop completes.",showToolCallInProgress:"Automatically expand tool call details while e\
 xecuting and keep them expanded after completion.",pyInterpreterEnabled:"Enable Python interpreter using Pyodide. Allows running Python code in markdown code blocks.",preEncodeConversation:"After each response, re-submit the conversation to pre-fill the server KV cache. Makes the next turn faster since the prompt is already encoded while you read the response.",enableContinueGeneration:'Enable "Continue" button for assistant messages. Currently works only with non-reasoning models.'},SETTINGS_COLOR_MODES_CONFIG=[
 {value:ColorMode.SYSTEM,label:"System",icon:Monitor},{value:ColorMode.LIGHT,label:"Light",icon:Sun},{value:ColorMode.DARK,label:"Dark",icon:Moon}],NUMERIC_FIELDS=["temperature","top_k","top_p","min_p","max_tokens","pasteLongTextToFileLen","dynatemp_range","dynatemp_exponent","typ_p","xtc_probability","xtc_threshold","repeat_last_n","repeat_penalty","presence_penalty","frequency_penalty","dry_multiplier","dry_base","dry_allowed_length","dry_penalty_last_n","agenticMaxTurns","agenticMaxToolPreview\
-Lines"],POSITIVE_INTEGER_FIELDS=["agenticMaxTurns","agenticMaxToolPreviewLines"],SETTINGS_KEYS={THEME:"theme",API_KEY:"apiKey",SYSTEM_MESSAGE:"systemMessage",PASTE_LONG_TEXT_TO_FILE_LEN:"pasteLongTextToFileLen",COPY_TEXT_ATTACHMENTS_AS_PLAIN_TEXT:"copyTextAttachmentsAsPlainText",ENABLE_CONTINUE_GENERATION:"enableContinueGeneration",PDF_AS_IMAGE:"pdfAsImage",ASK_FOR_TITLE_CONFIRMATION:"askForTitleConfirmation",SHOW_MESSAGE_STATS:"showMessageStats",SHOW_THOUGHT_IN_PROGRESS:"showThoughtInProgress",
-KEEP_STATS_VISIBLE:"keepStatsVisible",AUTO_MIC_ON_EMPTY:"autoMicOnEmpty",RENDER_USER_CONTENT_AS_MARKDOWN:"renderUserContentAsMarkdown",DISABLE_AUTO_SCROLL:"disableAutoScroll",ALWAYS_SHOW_SIDEBAR_ON_DESKTOP:"alwaysShowSidebarOnDesktop",FULL_HEIGHT_CODE_BLOCKS:"fullHeightCodeBlocks",SHOW_RAW_MODEL_NAMES:"showRawModelNames",TEMPERATURE:"temperature",DYNATEMP_RANGE:"dynatemp_range",DYNATEMP_EXPONENT:"dynatemp_exponent",TOP_K:"top_k",TOP_P:"top_p",MIN_P:"min_p",XTC_PROBABILITY:"xtc_probability",XTC_THRESHOLD:"\
-xtc_threshold",TYP_P:"typ_p",MAX_TOKENS:"max_tokens",SAMPLERS:"samplers",BACKEND_SAMPLING:"backend_sampling",REPEAT_LAST_N:"repeat_last_n",REPEAT_PENALTY:"repeat_penalty",PRESENCE_PENALTY:"presence_penalty",FREQUENCY_PENALTY:"frequency_penalty",DRY_MULTIPLIER:"dry_multiplier",DRY_BASE:"dry_base",DRY_ALLOWED_LENGTH:"dry_allowed_length",DRY_PENALTY_LAST_N:"dry_penalty_last_n",AGENTIC_MAX_TURNS:"agenticMaxTurns",ALWAYS_SHOW_AGENTIC_TURNS:"alwaysShowAgenticTurns",AGENTIC_MAX_TOOL_PREVIEW_LINES:"agen\
-ticMaxToolPreviewLines",SHOW_TOOL_CALL_IN_PROGRESS:"showToolCallInProgress",DISABLE_REASONING_PARSING:"disableReasoningParsing",EXCLUDE_REASONING_FROM_CONTEXT:"excludeReasoningFromContext",SHOW_RAW_OUTPUT_SWITCH:"showRawOutputSwitch",CUSTOM:"custom"},SETTINGS_SECTION_TITLES={GENERAL:"General",DISPLAY:"Display",SAMPLING:"Sampling",PENALTIES:"Penalties",AGENTIC:"Agentic",TOOLS:"Tools",IMPORT_EXPORT:"Import/Export",DEVELOPER:"Developer"},SETTINGS_CHAT_SECTIONS=[{title:SETTINGS_SECTION_TITLES.GENERAL,
-slug:"general",icon:Sliders_vertical,fields:[{key:SETTINGS_KEYS.THEME,label:"Theme",type:SettingsFieldType.SELECT,options:SETTINGS_COLOR_MODES_CONFIG},{key:SETTINGS_KEYS.API_KEY,label:"API Key",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.SYSTEM_MESSAGE,label:"System Message",type:SettingsFieldType.TEXTAREA},{key:SETTINGS_KEYS.PASTE_LONG_TEXT_TO_FILE_LEN,label:"Paste long text to file length",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.COPY_TEXT_ATTACHMENTS_AS_PLAIN_TEXT,label:"Copy tex\
-t attachments as plain text",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.ENABLE_CONTINUE_GENERATION,label:'Enable "Continue" button',type:SettingsFieldType.CHECKBOX,isExperimental:!0},{key:SETTINGS_KEYS.PDF_AS_IMAGE,label:"Parse PDF as image",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.ASK_FOR_TITLE_CONFIRMATION,label:"Ask for confirmation before changing conversation title",type:SettingsFieldType.CHECKBOX}]},{title:SETTINGS_SECTION_TITLES.DISPLAY,slug:"display",icon:Monitor,fields:[
-{key:SETTINGS_KEYS.SHOW_MESSAGE_STATS,label:"Show message generation statistics",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.SHOW_THOUGHT_IN_PROGRESS,label:"Show thought in progress",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.SHOW_TOOL_CALL_IN_PROGRESS,label:"Show tool call in progress",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.KEEP_STATS_VISIBLE,label:"Keep stats visible after generation",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.AUTO_MIC_ON_EMPTY,label:"Sho\
-w microphone on empty input",type:SettingsFieldType.CHECKBOX,isExperimental:!0},{key:SETTINGS_KEYS.RENDER_USER_CONTENT_AS_MARKDOWN,label:"Render user content as Markdown",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.FULL_HEIGHT_CODE_BLOCKS,label:"Use full height code blocks",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.DISABLE_AUTO_SCROLL,label:"Disable automatic scroll",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.ALWAYS_SHOW_SIDEBAR_ON_DESKTOP,label:"Always show sidebar on\
- desktop",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.SHOW_RAW_MODEL_NAMES,label:"Show raw model names",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.ALWAYS_SHOW_AGENTIC_TURNS,label:"Always show agentic turns in conversation",type:SettingsFieldType.CHECKBOX}]},{title:SETTINGS_SECTION_TITLES.SAMPLING,slug:"sampling",icon:Funnel,fields:[{key:SETTINGS_KEYS.TEMPERATURE,label:"Temperature",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.DYNATEMP_RANGE,label:"Dynamic temperature range",
-type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.DYNATEMP_EXPONENT,label:"Dynamic temperature exponent",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.TOP_K,label:"Top K",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.TOP_P,label:"Top P",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.MIN_P,label:"Min P",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.XTC_PROBABILITY,label:"XTC probability",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.XTC_THRESHOLD,label:"XTC threshold",type:SettingsFieldType.
-INPUT},{key:SETTINGS_KEYS.TYP_P,label:"Typical P",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.MAX_TOKENS,label:"Max tokens",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.SAMPLERS,label:"Samplers",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.BACKEND_SAMPLING,label:"Backend sampling",type:SettingsFieldType.CHECKBOX}]},{title:SETTINGS_SECTION_TITLES.PENALTIES,slug:"penalties",icon:Triangle_alert,fields:[{key:SETTINGS_KEYS.REPEAT_LAST_N,label:"Repeat last N",type:SettingsFieldType.INPUT},
-{key:SETTINGS_KEYS.REPEAT_PENALTY,label:"Repeat penalty",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.PRESENCE_PENALTY,label:"Presence penalty",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.FREQUENCY_PENALTY,label:"Frequency penalty",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.DRY_MULTIPLIER,label:"DRY multiplier",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.DRY_BASE,label:"DRY base",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.DRY_ALLOWED_LENGTH,label:"DRY allowed length",type:SettingsFieldType.
-INPUT},{key:SETTINGS_KEYS.DRY_PENALTY_LAST_N,label:"DRY penalty last N",type:SettingsFieldType.INPUT}]},{title:SETTINGS_SECTION_TITLES.AGENTIC,slug:"agentic",icon:List_restart,fields:[{key:SETTINGS_KEYS.AGENTIC_MAX_TURNS,label:"Agentic turns",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.AGENTIC_MAX_TOOL_PREVIEW_LINES,label:"Max lines per tool preview",type:SettingsFieldType.INPUT}]},{title:SETTINGS_SECTION_TITLES.TOOLS,slug:"tools",icon:Pencil_ruler},{title:SETTINGS_SECTION_TITLES.IMPORT_EXPORT,
-slug:"import-export",icon:Database},{title:SETTINGS_SECTION_TITLES.DEVELOPER,slug:"developer",icon:Code,fields:[{key:SETTINGS_KEYS.DISABLE_REASONING_PARSING,label:"Disable reasoning content parsing",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.EXCLUDE_REASONING_FROM_CONTEXT,label:"Exclude reasoning from context",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.SHOW_RAW_OUTPUT_SWITCH,label:"Enable raw output toggle",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.CUSTOM,label:"Custo\
-m JSON",type:SettingsFieldType.TEXTAREA}]}];FileTypeAudio.MP3+"",FileExtensionAudio.MP3,MimeTypeAudio.MP3_MPEG,MimeTypeAudio.MP3,FileTypeAudio.WAV+"",FileExtensionAudio.WAV,MimeTypeAudio.WAV;FileTypeImage.JPEG+"",FileExtensionImage.JPG,FileExtensionImage.JPEG,MimeTypeImage.JPEG,FileTypeImage.PNG+"",FileExtensionImage.PNG,MimeTypeImage.PNG,FileTypeImage.GIF+"",FileExtensionImage.GIF,MimeTypeImage.GIF,FileTypeImage.WEBP+"",FileExtensionImage.WEBP,MimeTypeImage.WEBP,FileTypeImage.SVG+"",FileExtensionImage.
-SVG,MimeTypeImage.SVG;FileTypePdf.PDF+"",FileExtensionPdf.PDF,MimeTypeApplication.PDF;FileTypeText.PLAIN_TEXT+"",FileExtensionText.TXT,MimeTypeText.PLAIN,FileTypeText.MARKDOWN+"",FileExtensionText.MD,MimeTypeText.MARKDOWN,FileTypeText.ASCIIDOC+"",FileExtensionText.ADOC,MimeTypeText.ASCIIDOC,FileTypeText.JAVASCRIPT+"",FileExtensionText.JS,MimeTypeText.JAVASCRIPT,MimeTypeText.JAVASCRIPT_APP,FileTypeText.TYPESCRIPT+"",FileExtensionText.TS,MimeTypeText.TYPESCRIPT,FileTypeText.JSX+"",FileExtensionText.
-JSX,MimeTypeText.JSX,FileTypeText.TSX+"",FileExtensionText.TSX,MimeTypeText.TSX,FileTypeText.CSS+"",FileExtensionText.CSS,MimeTypeText.CSS,FileTypeText.HTML+"",FileExtensionText.HTML,FileExtensionText.HTM,MimeTypeText.HTML,FileTypeText.JSON+"",FileExtensionText.JSON,MimeTypeText.JSON,FileTypeText.XML+"",FileExtensionText.XML,MimeTypeText.XML_TEXT,MimeTypeText.XML_APP,FileTypeText.YAML+"",FileExtensionText.YAML,FileExtensionText.YML,MimeTypeText.YAML_TEXT,MimeTypeText.YAML_APP,FileTypeText.CSV+"",
-FileExtensionText.CSV,MimeTypeText.CSV,FileTypeText.LOG+"",FileExtensionText.LOG,MimeTypeText.PLAIN,FileTypeText.PYTHON+"",FileExtensionText.PY,MimeTypeText.PYTHON,FileTypeText.JAVA+"",FileExtensionText.JAVA,MimeTypeText.JAVA,FileTypeText.CPP+"",FileExtensionText.CPP,FileExtensionText.C,FileExtensionText.H,FileExtensionText.HPP,MimeTypeText.CPP_SRC,MimeTypeText.CPP_HDR,MimeTypeText.C_SRC,MimeTypeText.C_HDR,FileTypeText.PHP+"",FileExtensionText.PHP,MimeTypeText.PHP,FileTypeText.RUBY+"",FileExtensionText.
-RB,MimeTypeText.RUBY,FileTypeText.GO+"",FileExtensionText.GO,MimeTypeText.GO,FileTypeText.RUST+"",FileExtensionText.RS,MimeTypeText.RUST,FileTypeText.SHELL+"",FileExtensionText.SH,FileExtensionText.BAT,MimeTypeText.SHELL,MimeTypeText.BAT,FileTypeText.SQL+"",FileExtensionText.SQL,MimeTypeText.SQL,FileTypeText.R+"",FileExtensionText.R,MimeTypeText.R,FileTypeText.SCALA+"",FileExtensionText.SCALA,MimeTypeText.SCALA,FileTypeText.KOTLIN+"",FileExtensionText.KT,MimeTypeText.KOTLIN,FileTypeText.SWIFT+"",
-FileExtensionText.SWIFT,MimeTypeText.SWIFT,FileTypeText.DART+"",FileExtensionText.DART,MimeTypeText.DART,FileTypeText.VUE+"",FileExtensionText.VUE,MimeTypeText.VUE,FileTypeText.SVELTE+"",FileExtensionText.SVELTE,MimeTypeText.SVELTE,FileTypeText.LATEX+"",FileExtensionText.TEX,MimeTypeText.LATEX,MimeTypeText.TEX,MimeTypeText.TEX_APP,FileTypeText.BIBTEX+"",FileExtensionText.BIB,MimeTypeText.BIBTEX,FileTypeText.CUDA+"",FileExtensionText.CU,FileExtensionText.CUH,MimeTypeText.CUDA,FileTypeText.VULKAN+
-"",FileExtensionText.COMP,MimeTypeText.PLAIN,FileTypeText.HASKELL+"",FileExtensionText.HS,MimeTypeText.HASKELL,FileTypeText.CSHARP+"",FileExtensionText.CS,MimeTypeText.CSHARP,FileTypeText.PROPERTIES+"",FileExtensionText.PROPERTIES,MimeTypeText.PROPERTIES;const BR_PATTERN=/<br\s*\/?\s*>/gi,LIST_PATTERN=/^<ul>([\s\S]*)<\/ul>$/i,LI_PATTERN=/<li>([\s\S]*?)<\/li>/gi,TOOL_GROUP_LABELS={[ToolSource.BUILTIN]:"Built-in",[ToolSource.CUSTOM]:"JSON Schema"},TOOL_SERVER_LABELS={[ToolSource.BUILTIN]:"Built-in\
- Tools",[ToolSource.CUSTOM]:"Custom Tools"},TOOLTIP_DELAY_DURATION=500;var root$1R=from_svg('<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 174 174" xmlns:xlink="http://www.w3.org/1999/xlink" fill="none" version="1.1"><g id="shape-320b5b95-d08d-8089-8007-585a8e498184"><defs><clipPath id="frame-clip-320b5b95-d08d-8089-8007-585a8e498184-render-1" class="frame-clip frame-clip-def"><rect rx="0" ry="0" x="0" y="0" width="174.00000000000045" height="174" transform="matrix(1.000000, 0.000000, 0.0\
-00000, 1.000000, 0.000000, 0.000000)"></rect></clipPath></defs><g class="frame-container-wrapper"><g class="frame-container-blur"><g class="frame-container-shadows"><g clip-path="url(#frame-clip-320b5b95-d08d-8089-8007-585a8e498184-render-1)" fill="none"><g class="fills" id="fills-320b5b95-d08d-8089-8007-585a8e498184"><rect rx="0" ry="0" x="0" y="0" width="174.00000000000045" height="174" transform="matrix(1.000000, 0.000000, 0.000000, 1.000000, 0.000000, 0.000000)" class="frame-background"></re\
-ct></g><g class="frame-children"><g id="shape-320b5b95-d08d-8089-8007-585a974337b1"><g class="fills" id="fills-320b5b95-d08d-8089-8007-585a974337b1"><path d="M15.5587158203125,81.5927734375L83.44091796875,13.7105712890625C92.813720703125,4.3380126953125,108.0096435546875,4.3380126953125,117.3817138671875,13.7105712890625L117.3817138671875,13.7105712890625C126.7547607421875,23.08306884765625,126.7547607421875,38.27911376953125,117.3817138671875,47.65167236328125L66.1168212890625,98.9169921875" fi\
-ll="none" stroke-linecap="round" style="fill: none;"></path></g><g fill="none" stroke-linecap="round" id="strokes-b954dcef-3e3e-8015-8007-585acd4382b6-320b5b95-d08d-8089-8007-585a974337b1" class="strokes"><g class="stroke-shape"><path d="M15.5587158203125,81.5927734375L83.44091796875,13.7105712890625C92.813720703125,4.3380126953125,108.0096435546875,4.3380126953125,117.3817138671875,13.7105712890625L117.3817138671875,13.7105712890625C126.7547607421875,23.08306884765625,126.7547607421875,38.27911\
-376953125,117.3817138671875,47.65167236328125L66.1168212890625,98.9169921875" style="fill: none; stroke-width: 12; stroke: currentColor; stroke-opacity: 1;"></path></g></g></g><g id="shape-320b5b95-d08d-8089-8007-585a974337b2"><g class="fills" id="fills-320b5b95-d08d-8089-8007-585a974337b2"><path d="M66.5587158203125,98.26885986328125L117.1165771484375,47.7105712890625C126.489501953125,38.3380126953125,141.6854248046875,38.3380126953125,151.0584716796875,47.7105712890625L151.4114990234375,48.064\
-0869140625C160.7845458984375,57.43670654296875,160.7845458984375,72.6326904296875,151.4114990234375,82.00518798828125L90.018310546875,143.39886474609375C86.8941650390625,146.52288818359375,86.8941650390625,151.587890625,90.018310546875,154.71185302734375L102.62451171875,167.31890869140625" fill="none" stroke-linecap="round" style="fill: none;"></path></g><g fill="none" stroke-linecap="round" id="strokes-b954dcef-3e3e-8015-8007-585acd447743-320b5b95-d08d-8089-8007-585a974337b2" class="strokes"><g\
- class="stroke-shape"><path d="M66.5587158203125,98.26885986328125L117.1165771484375,47.7105712890625C126.489501953125,38.3380126953125,141.6854248046875,38.3380126953125,151.0584716796875,47.7105712890625L151.4114990234375,48.0640869140625C160.7845458984375,57.43670654296875,160.7845458984375,72.6326904296875,151.4114990234375,82.00518798828125L90.018310546875,143.39886474609375C86.8941650390625,146.52288818359375,86.8941650390625,151.587890625,90.018310546875,154.71185302734375L102.62451171875\
-,167.31890869140625" style="fill: none; stroke-width: 12; stroke: currentColor; stroke-opacity: 1;"></path></g></g></g><g id="shape-320b5b95-d08d-8089-8007-585a974337b3"><g class="fills" id="fills-320b5b95-d08d-8089-8007-585a974337b3"><path d="M99.79296875,30.68115234375L49.588134765625,80.8857421875C40.215576171875,90.258056640625,40.215576171875,105.45404052734375,49.588134765625,114.82708740234375L49.588134765625,114.82708740234375C58.9608154296875,124.19903564453125,74.1566162109375,124.1990\
-3564453125,83.529296875,114.82708740234375L133.7340087890625,64.62225341796875" fill="none" stroke-linecap="round" style="fill: none;"></path></g><g fill="none" stroke-linecap="round" id="strokes-b954dcef-3e3e-8015-8007-585acd44c5c9-320b5b95-d08d-8089-8007-585a974337b3" class="strokes"><g class="stroke-shape"><path d="M99.79296875,30.68115234375L49.588134765625,80.8857421875C40.215576171875,90.258056640625,40.215576171875,105.45404052734375,49.588134765625,114.82708740234375L49.588134765625,114.\
-82708740234375C58.9608154296875,124.19903564453125,74.1566162109375,124.19903564453125,83.529296875,114.82708740234375L133.7340087890625,64.62225341796875" style="fill: none; stroke-width: 12; stroke: currentColor; stroke-opacity: 1;"></path></g></g></g></g></g></g></g></g></g></svg>');function McpLogo($$anchor,$$props){let className=prop($$props,"class",3,""),style2=prop($$props,"style",3,"");var svg2=root$1R();template_effect(()=>{set_class(svg2,0,clsx(className())),set_style(svg2,style2())}),
-append($$anchor,svg2)}const FORK_TREE_DEPTH_PADDING=8,SYSTEM_MESSAGE_PLACEHOLDER="System message",APP_NAME="llama.cpp",ICON_STRIP_TRANSITION_DURATION=150,ICON_STRIP_TRANSITION_DELAY_MULTIPLIER=50,SIDEBAR_ACTIONS_ITEMS=[{icon:Square_pen,tooltip:"New chat",route:"?new_chat=true#/",keys:["shift","cmd","o"]},{icon:Search,tooltip:"Search",keys:["cmd","k"]},{icon:McpLogo,tooltip:"MCP Servers",route:"#/settings/mcp",activeRouteId:"/settings/mcp"},{icon:Settings$1,tooltip:"Settings",route:"#/settings/ch\
-at/general",activeRoutePrefix:"/settings/chat"}],URI_SCHEME_SEPARATOR="://",TEMPLATE_EXPRESSION_REGEX=/\{([+#./;?&]?)([^}]+)\}/g,URI_TEMPLATE_OPERATORS={RESERVED:"+",FRAGMENT:"#",PATH_SEGMENT:"/",LABEL:".",PATH_PARAM:";",FORM_QUERY:"?",FORM_CONTINUATION:"&"},URI_TEMPLATE_SEPARATORS={COMMA:",",SLASH:"/",PERIOD:".",SEMICOLON:";",QUERY_PREFIX:"?",QUERY_CONTINUATION:"&"},VARIABLE_EXPLODE_MODIFIER_REGEX=/[*]$/,VARIABLE_PREFIX_MODIFIER_REGEX=/:[\d]+$/,LEADING_SLASHES_REGEX=/^\/+/,DEFAULT_MOBILE_BREAKPOINT=768;
-class IsMobile extends MediaQuery{constructor(breakpoint=DEFAULT_MOBILE_BREAKPOINT){super(`max-width: ${breakpoint-1}px`)}}const SYNCABLE_PARAMETERS=[{key:"temperature",serverKey:"temperature",type:SyncableParameterType.NUMBER,canSync:!0},{key:"top_k",serverKey:"top_k",type:SyncableParameterType.NUMBER,canSync:!0},{key:"top_p",serverKey:"top_p",type:SyncableParameterType.NUMBER,canSync:!0},{key:"min_p",serverKey:"min_p",type:SyncableParameterType.NUMBER,canSync:!0},{key:"dynatemp_range",serverKey:"\
-dynatemp_range",type:SyncableParameterType.NUMBER,canSync:!0},{key:"dynatemp_exponent",serverKey:"dynatemp_exponent",type:SyncableParameterType.NUMBER,canSync:!0},{key:"xtc_probability",serverKey:"xtc_probability",type:SyncableParameterType.NUMBER,canSync:!0},{key:"xtc_threshold",serverKey:"xtc_threshold",type:SyncableParameterType.NUMBER,canSync:!0},{key:"typ_p",serverKey:"typ_p",type:SyncableParameterType.NUMBER,canSync:!0},{key:"repeat_last_n",serverKey:"repeat_last_n",type:SyncableParameterType.
-NUMBER,canSync:!0},{key:"repeat_penalty",serverKey:"repeat_penalty",type:SyncableParameterType.NUMBER,canSync:!0},{key:"presence_penalty",serverKey:"presence_penalty",type:SyncableParameterType.NUMBER,canSync:!0},{key:"frequency_penalty",serverKey:"frequency_penalty",type:SyncableParameterType.NUMBER,canSync:!0},{key:"dry_multiplier",serverKey:"dry_multiplier",type:SyncableParameterType.NUMBER,canSync:!0},{key:"dry_base",serverKey:"dry_base",type:SyncableParameterType.NUMBER,canSync:!0},{key:"dr\
-y_allowed_length",serverKey:"dry_allowed_length",type:SyncableParameterType.NUMBER,canSync:!0},{key:"dry_penalty_last_n",serverKey:"dry_penalty_last_n",type:SyncableParameterType.NUMBER,canSync:!0},{key:"max_tokens",serverKey:"max_tokens",type:SyncableParameterType.NUMBER,canSync:!0},{key:"samplers",serverKey:"samplers",type:SyncableParameterType.STRING,canSync:!0},{key:"backend_sampling",serverKey:"backend_sampling",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"pasteLongTextToFileLen",serverKey:"\
-pasteLongTextToFileLen",type:SyncableParameterType.NUMBER,canSync:!0},{key:"pdfAsImage",serverKey:"pdfAsImage",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"showThoughtInProgress",serverKey:"showThoughtInProgress",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"keepStatsVisible",serverKey:"keepStatsVisible",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"showMessageStats",serverKey:"showMessageStats",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"askForTitleConfirmation",serverKey:"\
-askForTitleConfirmation",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"titleGenerationUseFirstLine",serverKey:"titleGenerationUseFirstLine",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"disableAutoScroll",serverKey:"disableAutoScroll",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"renderUserContentAsMarkdown",serverKey:"renderUserContentAsMarkdown",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"autoMicOnEmpty",serverKey:"autoMicOnEmpty",type:SyncableParameterType.BOOLEAN,
-canSync:!0},{key:"pyInterpreterEnabled",serverKey:"pyInterpreterEnabled",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"enableContinueGeneration",serverKey:"enableContinueGeneration",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"fullHeightCodeBlocks",serverKey:"fullHeightCodeBlocks",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"systemMessage",serverKey:"systemMessage",type:SyncableParameterType.STRING,canSync:!0},{key:"showSystemMessage",serverKey:"showSystemMessage",type:SyncableParameterType.
-BOOLEAN,canSync:!0},{key:"theme",serverKey:"theme",type:SyncableParameterType.STRING,canSync:!0},{key:"copyTextAttachmentsAsPlainText",serverKey:"copyTextAttachmentsAsPlainText",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"showRawOutputSwitch",serverKey:"showRawOutputSwitch",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"alwaysShowSidebarOnDesktop",serverKey:"alwaysShowSidebarOnDesktop",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"showRawModelNames",serverKey:"showRawModelN\
-ames",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"mcpServers",serverKey:"mcpServers",type:SyncableParameterType.STRING,canSync:!0},{key:"agenticMaxTurns",serverKey:"agenticMaxTurns",type:SyncableParameterType.NUMBER,canSync:!0},{key:"agenticMaxToolPreviewLines",serverKey:"agenticMaxToolPreviewLines",type:SyncableParameterType.NUMBER,canSync:!0},{key:"showToolCallInProgress",serverKey:"showToolCallInProgress",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"alwaysShowAgenticTurns",serverKey:"\
-alwaysShowAgenticTurns",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"excludeReasoningFromContext",serverKey:"excludeReasoningFromContext",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"sendOnEnter",serverKey:"sendOnEnter",type:SyncableParameterType.BOOLEAN,canSync:!0}];class ParameterSyncService{static roundFloatingPoint(value){return normalizeFloatingPoint(value)}static extractServerDefaults(serverParams,webuiSettings){const extracted={};if(serverParams){for(const param of SYNCABLE_PARAMETERS)
-if(param.canSync&&param.serverKey in serverParams){const value=serverParams[param.serverKey];value!==void 0&&(extracted[param.key]=this.roundFloatingPoint(value))}serverParams.samplers&&Array.isArray(serverParams.samplers)&&(extracted.samplers=serverParams.samplers.join(";"))}if(webuiSettings){for(const param of SYNCABLE_PARAMETERS)if(param.canSync&&param.serverKey in webuiSettings){const value=webuiSettings[param.serverKey];value!==void 0&&(extracted[param.key]=this.roundFloatingPoint(value))}}
-return extracted}static mergeWithServerDefaults(currentSettings,serverDefaults,userOverrides=new Set){const merged={...currentSettings};for(const[key2,serverValue]of Object.entries(serverDefaults))userOverrides.has(key2)||(merged[key2]=this.roundFloatingPoint(serverValue));return merged}static getParameterInfo(key2,currentValue,propsDefaults,userOverrides){const hasPropsDefault=propsDefaults[key2]!==void 0,isUserOverride=userOverrides.has(key2),source2=isUserOverride?ParameterSource.CUSTOM:ParameterSource.
-DEFAULT;return{value:currentValue,source:source2,serverDefault:hasPropsDefault?propsDefaults[key2]:void 0,userOverride:isUserOverride?currentValue:void 0}}static canSyncParameter(key2){return SYNCABLE_PARAMETERS.some(param=>param.key===key2&&param.canSync)}static getSyncableParameterKeys(){return SYNCABLE_PARAMETERS.filter(param=>param.canSync).map(param=>param.key)}static validateServerParameter(key2,value){const param=SYNCABLE_PARAMETERS.find(p2=>p2.key===key2);if(!param)return!1;switch(param.
-type){case SyncableParameterType.NUMBER:return typeof value=="number"&&!isNaN(value);case SyncableParameterType.STRING:return typeof value=="string";case SyncableParameterType.BOOLEAN:return typeof value=="boolean";default:return!1}}static createParameterDiff(currentSettings,serverDefaults){const diff2={};for(const key2 of this.getSyncableParameterKeys()){const currentValue=currentSettings[key2],serverValue=serverDefaults[key2];serverValue!==void 0&&(diff2[key2]={current:currentValue,server:serverValue,
-differs:currentValue!==serverValue})}return diff2}}class PropsService{static async fetch(autoload=!1){const params={};return autoload||(params.autoload="false"),apiFetchWithParams("./props",params,{authOnly:!0})}static async fetchForModel(modelId,autoload=!1){const params={model:modelId};return autoload||(params.autoload="false"),apiFetchWithParams("./props",params,{authOnly:!0})}}class ServerStore{#props=state$1(null);get props(){return get$3(this.#props)}set props(value){set$1(this.#props,value,
-!0)}#loading=state$1(!1);get loading(){return get$3(this.#loading)}set loading(value){set$1(this.#loading,value,!0)}#error=state$1(null);get error(){return get$3(this.#error)}set error(value){set$1(this.#error,value,!0)}#role=state$1(null);get role(){return get$3(this.#role)}set role(value){set$1(this.#role,value,!0)}fetchPromise=null;get defaultParams(){return this.props?.default_generation_settings?.params||null}get contextSize(){const nCtx=this.props?.default_generation_settings?.n_ctx;return typeof nCtx==
-"number"?nCtx:null}get webuiSettings(){return this.props?.webui_settings}get isRouterMode(){return this.role===ServerRole.ROUTER}get isModelMode(){return this.role===ServerRole.MODEL}async fetch(){if(this.fetchPromise)return this.fetchPromise;this.loading=!0,this.error=null;const fetchPromise=(async()=>{try{const props=await PropsService.fetch();this.props=props,this.error=null,this.detectRole(props)}catch(error2){this.error=this.getErrorMessage(error2),console.error("Error fetching server prope\
-rties:",error2)}finally{this.loading=!1,this.fetchPromise=null}})();this.fetchPromise=fetchPromise,await fetchPromise}getErrorMessage(error2){if(error2 instanceof Error){const message=error2.message||"";if(error2.name==="TypeError"&&message.includes("fetch"))return"Server is not running or unreachable";if(message.includes("ECONNREFUSED"))return"Connection refused - server may be offline";if(message.includes("ENOTFOUND"))return"Server not found - check server address";if(message.includes("ETIMEDO\
-UT"))return"Request timed out";if(message.includes("503"))return"Server temporarily unavailable";if(message.includes("500"))return"Server error - check server logs";if(message.includes("404"))return"Server endpoint not found";if(message.includes("403")||message.includes("401"))return"Access denied"}return"Failed to connect to server"}clear(){this.props=null,this.error=null,this.loading=!1,this.role=null,this.fetchPromise=null}detectRole(props){const newRole=props?.role===ServerRole.ROUTER?ServerRole.
-ROUTER:ServerRole.MODEL;this.role!==newRole&&(this.role=newRole,console.info(`Server running in ${newRole===ServerRole.ROUTER?"ROUTER":"MODEL"} mode`))}}const serverStore=new ServerStore,serverProps=()=>serverStore.props,serverLoading=()=>serverStore.loading,serverError=()=>serverStore.error,contextSize=()=>serverStore.contextSize,isRouterMode=()=>serverStore.isRouterMode;class SettingsStore{#config=state$1(proxy({...SETTING_CONFIG_DEFAULT}));get config(){return get$3(this.#config)}set config(value){
-set$1(this.#config,value,!0)}#theme=state$1("auto");get theme(){return get$3(this.#theme)}set theme(value){set$1(this.#theme,value,!0)}#isInitialized=state$1(!1);get isInitialized(){return get$3(this.#isInitialized)}set isInitialized(value){set$1(this.#isInitialized,value,!0)}#userOverrides=state$1(proxy(new Set));get userOverrides(){return get$3(this.#userOverrides)}set userOverrides(value){set$1(this.#userOverrides,value,!0)}getServerDefaults(){const serverParams=serverStore.defaultParams,webuiSettings=serverStore.
-webuiSettings;return ParameterSyncService.extractServerDefaults(serverParams,webuiSettings)}constructor(){this.initialize()}initialize(){try{this.loadConfig(),this.loadTheme(),this.isInitialized=!0}catch(error2){console.error("Failed to initialize settings store:",error2)}}loadConfig(){try{const storedConfigRaw=localStorage.getItem(CONFIG_LOCALSTORAGE_KEY),savedVal=JSON.parse(storedConfigRaw||"{}");this.config={...SETTING_CONFIG_DEFAULT,...savedVal},"sendOnEnter"in savedVal||new IsMobile().current&&
-(this.config.sendOnEnter=!1);const savedOverrides=JSON.parse(localStorage.getItem(USER_OVERRIDES_LOCALSTORAGE_KEY)||"[]");this.userOverrides=new Set(savedOverrides)}catch(error2){console.warn("Failed to parse config from localStorage, using defaults:",error2),this.config={...SETTING_CONFIG_DEFAULT},this.userOverrides=new Set}}loadTheme(){this.theme=localStorage.getItem("theme")||"auto"}updateConfig(key2,value){if(this.config[key2]=value,ParameterSyncService.canSyncParameter(key2)){const propsDefault=this.
-getServerDefaults()[key2];if(propsDefault!==void 0){const normalizedValue=normalizeFloatingPoint(value),normalizedDefault=normalizeFloatingPoint(propsDefault);normalizedValue===normalizedDefault?this.userOverrides.delete(key2):this.userOverrides.add(key2)}}this.saveConfig()}updateMultipleConfig(updates){Object.assign(this.config,updates);const propsDefaults=this.getServerDefaults();for(const[key2,value]of Object.entries(updates))if(ParameterSyncService.canSyncParameter(key2)){const propsDefault=propsDefaults[key2];
-if(propsDefault!==void 0){const normalizedValue=normalizeFloatingPoint(value),normalizedDefault=normalizeFloatingPoint(propsDefault);normalizedValue===normalizedDefault?this.userOverrides.delete(key2):this.userOverrides.add(key2)}}this.saveConfig()}saveConfig(){try{localStorage.setItem(CONFIG_LOCALSTORAGE_KEY,JSON.stringify(this.config)),localStorage.setItem(USER_OVERRIDES_LOCALSTORAGE_KEY,JSON.stringify(Array.from(this.userOverrides)))}catch(error2){console.error("Failed to save config to local\
-Storage:",error2)}}updateTheme(newTheme){this.theme=newTheme,this.saveTheme()}saveTheme(){try{this.theme==="auto"?localStorage.removeItem("theme"):localStorage.setItem("theme",this.theme)}catch(error2){console.error("Failed to save theme to localStorage:",error2)}}resetConfig(){this.config={...SETTING_CONFIG_DEFAULT},this.saveConfig()}resetTheme(){this.theme="auto",this.saveTheme()}resetAll(){this.resetConfig(),this.resetTheme()}resetParameterToServerDefault(key2){const serverDefaults=this.getServerDefaults(),
-webuiSettings=serverStore.webuiSettings;webuiSettings&&key2 in webuiSettings?setConfigValue(this.config,key2,webuiSettings[key2]):serverDefaults[key2]!==void 0?setConfigValue(this.config,key2,""):key2 in SETTING_CONFIG_DEFAULT&&setConfigValue(this.config,key2,getConfigValue(SETTING_CONFIG_DEFAULT,key2)),this.userOverrides.delete(key2),this.saveConfig()}syncWithServerDefaults(){const propsDefaults=this.getServerDefaults();if(Object.keys(propsDefaults).length===0)return;const webuiSettings=serverStore.
-webuiSettings,webuiSettingsKeys=new Set(webuiSettings?Object.keys(webuiSettings):[]);for(const[key2,propsValue]of Object.entries(propsDefaults)){const currentValue=getConfigValue(this.config,key2),normalizedCurrent=normalizeFloatingPoint(currentValue),normalizedDefault=normalizeFloatingPoint(propsValue);normalizedCurrent===normalizedDefault&&(this.userOverrides.delete(key2),!webuiSettingsKeys.has(key2)&&getConfigValue(SETTING_CONFIG_DEFAULT,key2)===void 0&&setConfigValue(this.config,key2,void 0))}
-if(webuiSettings)for(const[key2,value]of Object.entries(webuiSettings))!this.userOverrides.has(key2)&&value!==void 0&&setConfigValue(this.config,key2,value);this.saveConfig(),console.log("User overrides after sync:",Array.from(this.userOverrides))}forceSyncWithServerDefaults(){const propsDefaults=this.getServerDefaults(),webuiSettings=serverStore.webuiSettings;for(const key2 of ParameterSyncService.getSyncableParameterKeys())webuiSettings&&key2 in webuiSettings?setConfigValue(this.config,key2,webuiSettings[key2]):
-propsDefaults[key2]!==void 0?setConfigValue(this.config,key2,""):key2 in SETTING_CONFIG_DEFAULT&&setConfigValue(this.config,key2,getConfigValue(SETTING_CONFIG_DEFAULT,key2)),this.userOverrides.delete(key2);this.saveConfig()}getConfig(key2){return this.config[key2]}getAllConfig(){return{...this.config}}canSyncParameter(key2){return ParameterSyncService.canSyncParameter(key2)}getParameterInfo(key2){const propsDefaults=this.getServerDefaults(),currentValue=getConfigValue(this.config,key2);return ParameterSyncService.
-getParameterInfo(key2,currentValue??"",propsDefaults,this.userOverrides)}getParameterDiff(){const serverDefaults=this.getServerDefaults();if(Object.keys(serverDefaults).length===0)return{};const configAsRecord=configToParameterRecord(this.config,ParameterSyncService.getSyncableParameterKeys());return ParameterSyncService.createParameterDiff(configAsRecord,serverDefaults)}clearAllUserOverrides(){this.userOverrides.clear(),this.saveConfig(),console.log("Cleared all user overrides")}}const settingsStore=new SettingsStore,
-config$1=()=>settingsStore.config;function redactValue(value,showLastChars){return showLastChars?`....${value.slice(-showLastChars)}`:"[redacted]"}function getAuthHeaders(){const apiKey=config$1().apiKey?.toString().trim();return apiKey?{Authorization:`Bearer ${apiKey}`}:{}}function getJsonHeaders(){return{"Content-Type":"application/json",...getAuthHeaders()}}function sanitizeHeaders(headers,extraRedactedHeaders,partialRedactHeaders){if(!headers)return{};const normalized=new Headers(headers),sanitized={},
-redactedHeaders=new Set(Array.from(extraRedactedHeaders??[],header=>header.toLowerCase()));for(const[key2,value]of normalized.entries()){const normalizedKey=key2.toLowerCase(),partialChars=partialRedactHeaders?.get(normalizedKey);partialChars!==void 0?sanitized[key2]=redactValue(value,partialChars):REDACTED_HEADERS.has(normalizedKey)||redactedHeaders.has(normalizedKey)?sanitized[key2]=redactValue(value):sanitized[key2]=value}return sanitized}async function apiFetch(path2,options={}){const{authOnly=!1,
-headers:customHeaders,...fetchOptions}=options,headers={...authOnly?getAuthHeaders():getJsonHeaders(),...customHeaders},url2=path2.startsWith(UrlProtocol.HTTP)||path2.startsWith(UrlProtocol.HTTPS)?path2:`${base}${path2}`,response=await fetch(url2,{...fetchOptions,headers});if(!response.ok){const errorMessage=await parseErrorMessage(response);throw new Error(errorMessage)}return response.json()}async function apiFetchWithParams(basePath,params,options={}){const url2=new URL(basePath,window.location.
-href);for(const[key2,value]of Object.entries(params))value!=null&&url2.searchParams.set(key2,value);const{authOnly=!1,headers:customHeaders,...fetchOptions}=options,headers={...authOnly?getAuthHeaders():getJsonHeaders(),...customHeaders},response=await fetch(url2.toString(),{...fetchOptions,headers});if(!response.ok){const errorMessage=await parseErrorMessage(response);throw new Error(errorMessage)}return response.json()}async function apiPost(path2,body2,options={}){return apiFetch(path2,{method:"\
-POST",body:JSON.stringify(body2),...options})}async function parseErrorMessage(response){try{const errorData=await response.json();if(errorData?.error?.message)return errorData.error.message;if(errorData?.error&&typeof errorData.error=="string")return errorData.error;if(errorData?.message)return errorData.message}catch{}return`Request failed: ${response.status} ${response.statusText}`}function error(status,body2){throw new HttpError(status,body2)}async function validateApiKey(fetch2){try{const apiKey=config$1().
-apiKey,headers={"Content-Type":"application/json"};apiKey&&(headers.Authorization=`Bearer ${apiKey}`);const response=await fetch2(`${base}/props`,{headers});if(!response.ok){if(response.status===401||response.status===403)throw error(401,"Access denied");console.warn(`Server responded with status ${response.status} during API key validation`);return}}catch(err){if(err&&typeof err=="object"&&"status"in err)throw err;console.warn("Cannot connect to server for API key validation:",err)}}function isMcpPrompt(item){
-return!!(item.attachment?.type===AttachmentType.MCP_PROMPT||item.uploadedFile?.type===SpecialFileType.MCP_PROMPT&&item.uploadedFile.mcpPrompt)}function isMcpResource(item){return item.attachment?.type===AttachmentType.MCP_RESOURCE}function getUploadedFileCategory$1(file){const categoryByMime=getFileTypeCategory(file.type);return categoryByMime||getFileTypeCategoryByExtension(file.name)}function getAttachmentDisplayItems(options){const{uploadedFiles=[],attachments=[]}=options,items2=[];for(const file of uploadedFiles)
-items2.push({id:file.id,name:file.name,size:file.size,preview:file.preview,isImage:getUploadedFileCategory$1(file)===FileTypeCategory.IMAGE,isLoading:file.isLoading,loadError:file.loadError,uploadedFile:file,textContent:file.textContent});for(const[index2,attachment]of attachments.entries()){const isImage2=isImageFile(attachment);items2.push({id:`attachment-${index2}`,name:attachment.name,size:"size"in attachment?attachment.size:void 0,preview:isImage2&&"base64Url"in attachment?attachment.base64Url:
-void 0,isImage:isImage2,attachment,attachmentIndex:index2,textContent:"content"in attachment?attachment.content:void 0})}return items2.reverse()}function getUploadedFileCategory(uploadedFile){const categoryByMime=getFileTypeCategory(uploadedFile.type);return categoryByMime||getFileTypeCategoryByExtension(uploadedFile.name)}function isTextFile(attachment,uploadedFile){return uploadedFile?getUploadedFileCategory(uploadedFile)===FileTypeCategory.TEXT:attachment?attachment.type===AttachmentType.TEXT||
-attachment.type===AttachmentType.LEGACY_CONTEXT:!1}function isImageFile(attachment,uploadedFile){return uploadedFile?getUploadedFileCategory(uploadedFile)===FileTypeCategory.IMAGE:attachment?attachment.type===AttachmentType.IMAGE:!1}function isPdfFile$1(attachment,uploadedFile){return uploadedFile?getUploadedFileCategory(uploadedFile)===FileTypeCategory.PDF:attachment?attachment.type===AttachmentType.PDF:!1}function isAudioFile(attachment,uploadedFile){return uploadedFile?getUploadedFileCategory(
-uploadedFile)===FileTypeCategory.AUDIO:attachment?attachment.type===AttachmentType.AUDIO:!1}function autoResizeTextarea(textareaElement){textareaElement&&(textareaElement.style.height="1rem",textareaElement.style.height=textareaElement.scrollHeight+"px")}function findMessageById(messages,id2){if(id2)return messages.find(m=>m.id===id2)}function filterByLeafNodeId(messages,leafNodeId,includeRoot=!1){const result=[],nodeMap=new Map;for(const msg of messages)nodeMap.set(msg.id,msg);let startNode=nodeMap.
-get(leafNodeId);if(!startNode){let latestTime=-1;for(const msg of messages)msg.timestamp>latestTime&&(startNode=msg,latestTime=msg.timestamp)}let currentNode=startNode;for(;currentNode&&((currentNode.type!=="root"||includeRoot)&&result.push(currentNode),currentNode.parent!==null);)currentNode=nodeMap.get(currentNode.parent);return result.sort((a,b)=>a.role===MessageRole.SYSTEM&&b.role!==MessageRole.SYSTEM?-1:a.role!==MessageRole.SYSTEM&&b.role===MessageRole.SYSTEM?1:a.timestamp-b.timestamp),result}
-function findLeafNode(messages,messageId){const nodeMap=new Map;for(const msg of messages)nodeMap.set(msg.id,msg);let currentNode=nodeMap.get(messageId);for(;currentNode&&currentNode.children.length>0;){const lastChildId=currentNode.children[currentNode.children.length-1];currentNode=nodeMap.get(lastChildId)}return currentNode?.id??messageId}function findDescendantMessages(messages,messageId){const nodeMap=new Map;for(const msg of messages)nodeMap.set(msg.id,msg);const descendants=[],queue=[messageId];
-for(;queue.length>0;){const currentId=queue.shift(),currentNode=nodeMap.get(currentId);if(currentNode)for(const childId of currentNode.children)descendants.push(childId),queue.push(childId)}return descendants}function getMessageSiblings(messages,messageId){const nodeMap=new Map;for(const msg of messages)nodeMap.set(msg.id,msg);const message=nodeMap.get(messageId);if(!message)return null;if(message.parent===null)return{message,siblingIds:[messageId],currentIndex:0,totalSiblings:1};const parentNode=nodeMap.
-get(message.parent);if(!parentNode)return{message,siblingIds:[messageId],currentIndex:0,totalSiblings:1};const siblingIds=parentNode.children,siblingLeafIds=siblingIds.map(siblingId=>findLeafNode(messages,siblingId)),currentIndex=siblingIds.indexOf(messageId);return{message,siblingIds:siblingLeafIds,currentIndex,totalSiblings:siblingIds.length}}var core$5,hasRequiredCore$4;function requireCore$4(){if(hasRequiredCore$4)return core$5;hasRequiredCore$4=1;function deepFreeze(obj){return obj instanceof
-Map?obj.clear=obj.delete=obj.set=function(){throw new Error("map is read-only")}:obj instanceof Set&&(obj.add=obj.clear=obj.delete=function(){throw new Error("set is read-only")}),Object.freeze(obj),Object.getOwnPropertyNames(obj).forEach(name=>{const prop2=obj[name],type2=typeof prop2;(type2==="object"||type2==="function")&&!Object.isFrozen(prop2)&&deepFreeze(prop2)}),obj}class Response2{constructor(mode){mode.data===void 0&&(mode.data={}),this.data=mode.data,this.isMatchIgnored=!1}ignoreMatch(){
-this.isMatchIgnored=!0}}function escapeHTML(value){return value.replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;").replace(/"/g,"&quot;").replace(/'/g,"&#x27;")}function inherit$1(original,...objects){const result=Object.create(null);for(const key2 in original)result[key2]=original[key2];return objects.forEach(function(obj){for(const key2 in obj)result[key2]=obj[key2]}),result}const SPAN_CLOSE="</span>",emitsWrappingTags=node2=>!!node2.scope,scopeToCSSClass=(name,{prefix})=>{if(name.
-startsWith("language:"))return name.replace("language:","language-");if(name.includes(".")){const pieces=name.split(".");return[`${prefix}${pieces.shift()}`,...pieces.map((x,i)=>`${x}${"_".repeat(i+1)}`)].join(" ")}return`${prefix}${name}`};class HTMLRenderer{constructor(parseTree3,options){this.buffer="",this.classPrefix=options.classPrefix,parseTree3.walk(this)}addText(text2){this.buffer+=escapeHTML(text2)}openNode(node2){if(!emitsWrappingTags(node2))return;const className=scopeToCSSClass(node2.
-scope,{prefix:this.classPrefix});this.span(className)}closeNode(node2){emitsWrappingTags(node2)&&(this.buffer+=SPAN_CLOSE)}value(){return this.buffer}span(className){this.buffer+=`<span class="${className}">`}}const newNode=(opts={})=>{const result={children:[]};return Object.assign(result,opts),result};class TokenTree{constructor(){this.rootNode=newNode(),this.stack=[this.rootNode]}get top(){return this.stack[this.stack.length-1]}get root(){return this.rootNode}add(node2){this.top.children.push(
-node2)}openNode(scope2){const node2=newNode({scope:scope2});this.add(node2),this.stack.push(node2)}closeNode(){if(this.stack.length>1)return this.stack.pop()}closeAllNodes(){for(;this.closeNode(););}toJSON(){return JSON.stringify(this.rootNode,null,4)}walk(builder){return this.constructor._walk(builder,this.rootNode)}static _walk(builder,node2){return typeof node2=="string"?builder.addText(node2):node2.children&&(builder.openNode(node2),node2.children.forEach(child2=>this._walk(builder,child2)),
-builder.closeNode(node2)),builder}static _collapse(node2){typeof node2!="string"&&node2.children&&(node2.children.every(el=>typeof el=="string")?node2.children=[node2.children.join("")]:node2.children.forEach(child2=>{TokenTree._collapse(child2)}))}}class TokenTreeEmitter extends TokenTree{constructor(options){super(),this.options=options}addText(text2){text2!==""&&this.add(text2)}startScope(scope2){this.openNode(scope2)}endScope(){this.closeNode()}__addSublanguage(emitter,name){const node2=emitter.
-root;name&&(node2.scope=`language:${name}`),this.add(node2)}toHTML(){return new HTMLRenderer(this,this.options).value()}finalize(){return this.closeAllNodes(),!0}}function source2(re2){return re2?typeof re2=="string"?re2:re2.source:null}function lookahead2(re2){return concat2("(?=",re2,")")}function anyNumberOfTimes(re2){return concat2("(?:",re2,")*")}function optional2(re2){return concat2("(?:",re2,")?")}function concat2(...args){return args.map(x=>source2(x)).join("")}function stripOptionsFromArgs2(args){
-const opts=args[args.length-1];return typeof opts=="object"&&opts.constructor===Object?(args.splice(args.length-1,1),opts):{}}function either2(...args){return"("+(stripOptionsFromArgs2(args).capture?"":"?:")+args.map(x=>source2(x)).join("|")+")"}function countMatchGroups(re2){return new RegExp(re2.toString()+"|").exec("").length-1}function startsWith(re2,lexeme){const match=re2&&re2.exec(lexeme);return match&&match.index===0}const BACKREF_RE=/\[(?:[^\\\]]|\\.)*\]|\(\??|\\([1-9][0-9]*)|\\./;function _rewriteBackreferences(regexps,{
-joinWith}){let numCaptures=0;return regexps.map(regex=>{numCaptures+=1;const offset2=numCaptures;let re2=source2(regex),out="";for(;re2.length>0;){const match=BACKREF_RE.exec(re2);if(!match){out+=re2;break}out+=re2.substring(0,match.index),re2=re2.substring(match.index+match[0].length),match[0][0]==="\\"&&match[1]?out+="\\"+String(Number(match[1])+offset2):(out+=match[0],match[0]==="("&&numCaptures++)}return out}).map(re2=>`(${re2})`).join(joinWith)}const MATCH_NOTHING_RE=/\b\B/,IDENT_RE2="[a-zA\
--Z]\\w*",UNDERSCORE_IDENT_RE="[a-zA-Z_]\\w*",NUMBER_RE="\\b\\d+(\\.\\d+)?",C_NUMBER_RE="(-?)(\\b0[xX][a-fA-F0-9]+|(\\b\\d+(\\.\\d*)?|\\.\\d+)([eE][-+]?\\d+)?)",BINARY_NUMBER_RE="\\b(0b[01]+)",RE_STARTERS_RE="!|!=|!==|%|%=|&|&&|&=|\\*|\\*=|\\+|\\+=|,|-|-=|/=|/|:|;|<<|<<=|<=|<|===|==|=|>>>=|>>=|>=|>>>|>>|>|\\?|\\[|\\{|\\(|\\^|\\^=|\\||\\|=|\\|\\||~",SHEBANG=(opts={})=>{const beginShebang=/^#![ ]*\//;return opts.binary&&(opts.begin=concat2(beginShebang,/.*\b/,opts.binary,/\b.*/)),inherit$1({scope:"\
-meta",begin:beginShebang,end:/$/,relevance:0,"on:begin":(m,resp)=>{m.index!==0&&resp.ignoreMatch()}},opts)},BACKSLASH_ESCAPE={begin:"\\\\[\\s\\S]",relevance:0},APOS_STRING_MODE={scope:"string",begin:"'",end:"'",illegal:"\\n",contains:[BACKSLASH_ESCAPE]},QUOTE_STRING_MODE={scope:"string",begin:'"',end:'"',illegal:"\\n",contains:[BACKSLASH_ESCAPE]},PHRASAL_WORDS_MODE={begin:/\b(a|an|the|are|I'm|isn't|don't|doesn't|won't|but|just|should|pretty|simply|enough|gonna|going|wtf|so|such|will|you|your|they|like|more)\b/},
-COMMENT=function(begin,end,modeOptions={}){const mode=inherit$1({scope:"comment",begin,end,contains:[]},modeOptions);mode.contains.push({scope:"doctag",begin:"[ ]*(?=(TODO|FIXME|NOTE|BUG|OPTIMIZE|HACK|XXX):)",end:/(TODO|FIXME|NOTE|BUG|OPTIMIZE|HACK|XXX):/,excludeBegin:!0,relevance:0});const ENGLISH_WORD=either2("I","a","is","so","us","to","at","if","in","it","on",/[A-Za-z]+['](d|ve|re|ll|t|s|n)/,/[A-Za-z]+[-][a-z]+/,/[A-Za-z][a-z]{2,}/);return mode.contains.push({begin:concat2(/[ ]+/,"(",ENGLISH_WORD,
-/[.]?[:]?([.][ ]|[ ])/,"){3}")}),mode},C_LINE_COMMENT_MODE=COMMENT("//","$"),C_BLOCK_COMMENT_MODE=COMMENT("/\\*","\\*/"),HASH_COMMENT_MODE=COMMENT("#","$"),NUMBER_MODE={scope:"number",begin:NUMBER_RE,relevance:0},C_NUMBER_MODE={scope:"number",begin:C_NUMBER_RE,relevance:0},BINARY_NUMBER_MODE={scope:"number",begin:BINARY_NUMBER_RE,relevance:0},REGEXP_MODE={scope:"regexp",begin:/\/(?=[^/\n]*\/)/,end:/\/[gimuy]*/,contains:[BACKSLASH_ESCAPE,{begin:/\[/,end:/\]/,relevance:0,contains:[BACKSLASH_ESCAPE]}]},
-TITLE_MODE={scope:"title",begin:IDENT_RE2,relevance:0},UNDERSCORE_TITLE_MODE={scope:"title",begin:UNDERSCORE_IDENT_RE,relevance:0},METHOD_GUARD={begin:"\\.\\s*"+UNDERSCORE_IDENT_RE,relevance:0};var MODES2=Object.freeze({__proto__:null,APOS_STRING_MODE,BACKSLASH_ESCAPE,BINARY_NUMBER_MODE,BINARY_NUMBER_RE,COMMENT,C_BLOCK_COMMENT_MODE,C_LINE_COMMENT_MODE,C_NUMBER_MODE,C_NUMBER_RE,END_SAME_AS_BEGIN:function(mode){return Object.assign(mode,{"on:begin":(m,resp)=>{resp.data._beginMatch=m[1]},"on:end":(m,resp)=>{
-resp.data._beginMatch!==m[1]&&resp.ignoreMatch()}})},HASH_COMMENT_MODE,IDENT_RE:IDENT_RE2,MATCH_NOTHING_RE,METHOD_GUARD,NUMBER_MODE,NUMBER_RE,PHRASAL_WORDS_MODE,QUOTE_STRING_MODE,REGEXP_MODE,RE_STARTERS_RE,SHEBANG,TITLE_MODE,UNDERSCORE_IDENT_RE,UNDERSCORE_TITLE_MODE});function skipIfHasPrecedingDot(match,response){match.input[match.index-1]==="."&&response.ignoreMatch()}function scopeClassName(mode,_parent){mode.className!==void 0&&(mode.scope=mode.className,delete mode.className)}function beginKeywords(mode,parent){
-parent&&mode.beginKeywords&&(mode.begin="\\b("+mode.beginKeywords.split(" ").join("|")+")(?!\\.)(?=\\b|\\s)",mode.__beforeBegin=skipIfHasPrecedingDot,mode.keywords=mode.keywords||mode.beginKeywords,delete mode.beginKeywords,mode.relevance===void 0&&(mode.relevance=0))}function compileIllegal(mode,_parent){Array.isArray(mode.illegal)&&(mode.illegal=either2(...mode.illegal))}function compileMatch(mode,_parent){if(mode.match){if(mode.begin||mode.end)throw new Error("begin & end are not supported wi\
-th match");mode.begin=mode.match,delete mode.match}}function compileRelevance(mode,_parent){mode.relevance===void 0&&(mode.relevance=1)}const beforeMatchExt=(mode,parent)=>{if(!mode.beforeMatch)return;if(mode.starts)throw new Error("beforeMatch cannot be used with starts");const originalMode=Object.assign({},mode);Object.keys(mode).forEach(key2=>{delete mode[key2]}),mode.keywords=originalMode.keywords,mode.begin=concat2(originalMode.beforeMatch,lookahead2(originalMode.begin)),mode.starts={relevance:0,
-contains:[Object.assign(originalMode,{endsParent:!0})]},mode.relevance=0,delete originalMode.beforeMatch},COMMON_KEYWORDS=["of","and","for","in","not","or","if","then","parent","list","value"],DEFAULT_KEYWORD_SCOPE="keyword";function compileKeywords(rawKeywords,caseInsensitive,scopeName=DEFAULT_KEYWORD_SCOPE){const compiledKeywords=Object.create(null);return typeof rawKeywords=="string"?compileList(scopeName,rawKeywords.split(" ")):Array.isArray(rawKeywords)?compileList(scopeName,rawKeywords):Object.
-keys(rawKeywords).forEach(function(scopeName2){Object.assign(compiledKeywords,compileKeywords(rawKeywords[scopeName2],caseInsensitive,scopeName2))}),compiledKeywords;function compileList(scopeName2,keywordList){caseInsensitive&&(keywordList=keywordList.map(x=>x.toLowerCase())),keywordList.forEach(function(keyword2){const pair=keyword2.split("|");compiledKeywords[pair[0]]=[scopeName2,scoreForKeyword(pair[0],pair[1])]})}}function scoreForKeyword(keyword2,providedScore){return providedScore?Number(
-providedScore):commonKeyword(keyword2)?0:1}function commonKeyword(keyword2){return COMMON_KEYWORDS.includes(keyword2.toLowerCase())}const seenDeprecations={},error2=message=>{console.error(message)},warn2=(message,...args)=>{console.log(`WARN: ${message}`,...args)},deprecated2=(version3,message)=>{seenDeprecations[`${version3}/${message}`]||(console.log(`Deprecated as of ${version3}. ${message}`),seenDeprecations[`${version3}/${message}`]=!0)},MultiClassError=new Error;function remapScopeNames(mode,regexes,{
-key:key2}){let offset2=0;const scopeNames=mode[key2],emit={},positions={};for(let i=1;i<=regexes.length;i++)positions[i+offset2]=scopeNames[i],emit[i+offset2]=!0,offset2+=countMatchGroups(regexes[i-1]);mode[key2]=positions,mode[key2]._emit=emit,mode[key2]._multi=!0}function beginMultiClass(mode){if(Array.isArray(mode.begin)){if(mode.skip||mode.excludeBegin||mode.returnBegin)throw error2("skip, excludeBegin, returnBegin not compatible with beginScope: {}"),MultiClassError;if(typeof mode.beginScope!=
-"object"||mode.beginScope===null)throw error2("beginScope must be object"),MultiClassError;remapScopeNames(mode,mode.begin,{key:"beginScope"}),mode.begin=_rewriteBackreferences(mode.begin,{joinWith:""})}}function endMultiClass(mode){if(Array.isArray(mode.end)){if(mode.skip||mode.excludeEnd||mode.returnEnd)throw error2("skip, excludeEnd, returnEnd not compatible with endScope: {}"),MultiClassError;if(typeof mode.endScope!="object"||mode.endScope===null)throw error2("endScope must be object"),MultiClassError;
-remapScopeNames(mode,mode.end,{key:"endScope"}),mode.end=_rewriteBackreferences(mode.end,{joinWith:""})}}function scopeSugar(mode){mode.scope&&typeof mode.scope=="object"&&mode.scope!==null&&(mode.beginScope=mode.scope,delete mode.scope)}function MultiClass(mode){scopeSugar(mode),typeof mode.beginScope=="string"&&(mode.beginScope={_wrap:mode.beginScope}),typeof mode.endScope=="string"&&(mode.endScope={_wrap:mode.endScope}),beginMultiClass(mode),endMultiClass(mode)}function compileLanguage(language2){
-function langRe(value,global2){return new RegExp(source2(value),"m"+(language2.case_insensitive?"i":"")+(language2.unicodeRegex?"u":"")+(global2?"g":""))}class MultiRegex{constructor(){this.matchIndexes={},this.regexes=[],this.matchAt=1,this.position=0}addRule(re2,opts){opts.position=this.position++,this.matchIndexes[this.matchAt]=opts,this.regexes.push([opts,re2]),this.matchAt+=countMatchGroups(re2)+1}compile(){this.regexes.length===0&&(this.exec=()=>null);const terminators=this.regexes.map(el=>el[1]);
-this.matcherRe=langRe(_rewriteBackreferences(terminators,{joinWith:"|"}),!0),this.lastIndex=0}exec(s2){this.matcherRe.lastIndex=this.lastIndex;const match=this.matcherRe.exec(s2);if(!match)return null;const i=match.findIndex((el,i2)=>i2>0&&el!==void 0),matchData=this.matchIndexes[i];return match.splice(0,i),Object.assign(match,matchData)}}class ResumableMultiRegex{constructor(){this.rules=[],this.multiRegexes=[],this.count=0,this.lastIndex=0,this.regexIndex=0}getMatcher(index2){if(this.multiRegexes[index2])
-return this.multiRegexes[index2];const matcher=new MultiRegex;return this.rules.slice(index2).forEach(([re2,opts])=>matcher.addRule(re2,opts)),matcher.compile(),this.multiRegexes[index2]=matcher,matcher}resumingScanAtSamePosition(){return this.regexIndex!==0}considerAll(){this.regexIndex=0}addRule(re2,opts){this.rules.push([re2,opts]),opts.type==="begin"&&this.count++}exec(s2){const m=this.getMatcher(this.regexIndex);m.lastIndex=this.lastIndex;let result=m.exec(s2);if(this.resumingScanAtSamePosition()&&
-!(result&&result.index===this.lastIndex)){const m2=this.getMatcher(0);m2.lastIndex=this.lastIndex+1,result=m2.exec(s2)}return result&&(this.regexIndex+=result.position+1,this.regexIndex===this.count&&this.considerAll()),result}}function buildModeRegex(mode){const mm=new ResumableMultiRegex;return mode.contains.forEach(term=>mm.addRule(term.begin,{rule:term,type:"begin"})),mode.terminatorEnd&&mm.addRule(mode.terminatorEnd,{type:"end"}),mode.illegal&&mm.addRule(mode.illegal,{type:"illegal"}),mm}function compileMode(mode,parent){
-const cmode=mode;if(mode.isCompiled)return cmode;[scopeClassName,compileMatch,MultiClass,beforeMatchExt].forEach(ext=>ext(mode,parent)),language2.compilerExtensions.forEach(ext=>ext(mode,parent)),mode.__beforeBegin=null,[beginKeywords,compileIllegal,compileRelevance].forEach(ext=>ext(mode,parent)),mode.isCompiled=!0;let keywordPattern=null;return typeof mode.keywords=="object"&&mode.keywords.$pattern&&(mode.keywords=Object.assign({},mode.keywords),keywordPattern=mode.keywords.$pattern,delete mode.
-keywords.$pattern),keywordPattern=keywordPattern||/\w+/,mode.keywords&&(mode.keywords=compileKeywords(mode.keywords,language2.case_insensitive)),cmode.keywordPatternRe=langRe(keywordPattern,!0),parent&&(mode.begin||(mode.begin=/\B|\b/),cmode.beginRe=langRe(cmode.begin),!mode.end&&!mode.endsWithParent&&(mode.end=/\B|\b/),mode.end&&(cmode.endRe=langRe(cmode.end)),cmode.terminatorEnd=source2(cmode.end)||"",mode.endsWithParent&&parent.terminatorEnd&&(cmode.terminatorEnd+=(mode.end?"|":"")+parent.terminatorEnd)),
-mode.illegal&&(cmode.illegalRe=langRe(mode.illegal)),mode.contains||(mode.contains=[]),mode.contains=[].concat(...mode.contains.map(function(c2){return expandOrCloneMode(c2==="self"?mode:c2)})),mode.contains.forEach(function(c2){compileMode(c2,cmode)}),mode.starts&&compileMode(mode.starts,parent),cmode.matcher=buildModeRegex(cmode),cmode}if(language2.compilerExtensions||(language2.compilerExtensions=[]),language2.contains&&language2.contains.includes("self"))throw new Error("ERR: contains `self`\
- is not supported at the top-level of a language.  See documentation.");return language2.classNameAliases=inherit$1(language2.classNameAliases||{}),compileMode(language2)}function dependencyOnParent(mode){return mode?mode.endsWithParent||dependencyOnParent(mode.starts):!1}function expandOrCloneMode(mode){return mode.variants&&!mode.cachedVariants&&(mode.cachedVariants=mode.variants.map(function(variant){return inherit$1(mode,{variants:null},variant)})),mode.cachedVariants?mode.cachedVariants:dependencyOnParent(
-mode)?inherit$1(mode,{starts:mode.starts?inherit$1(mode.starts):null}):Object.isFrozen(mode)?inherit$1(mode):mode}var version2="11.11.1";class HTMLInjectionError extends Error{constructor(reason,html2){super(reason),this.name="HTMLInjectionError",this.html=html2}}const escape2=escapeHTML,inherit=inherit$1,NO_MATCH=Symbol("nomatch"),MAX_KEYWORD_HITS=7,HLJS=function(hljs){const languages=Object.create(null),aliases=Object.create(null),plugins=[];let SAFE_MODE=!0;const LANGUAGE_NOT_FOUND="Could not\
- find the language '{}', did you forget to load/include a language module?",PLAINTEXT_LANGUAGE={disableAutodetect:!0,name:"Plain text",contains:[]};let options={ignoreUnescapedHTML:!1,throwUnescapedHTML:!1,noHighlightRe:/^(no-?highlight)$/i,languageDetectRe:/\blang(?:uage)?-([\w-]+)\b/i,classPrefix:"hljs-",cssSelector:"pre code",languages:null,__emitter:TokenTreeEmitter};function shouldNotHighlight(languageName){return options.noHighlightRe.test(languageName)}function blockLanguage(block2){let classes=block2.
-className+" ";classes+=block2.parentNode?block2.parentNode.className:"";const match=options.languageDetectRe.exec(classes);if(match){const language2=getLanguage(match[1]);return language2||(warn2(LANGUAGE_NOT_FOUND.replace("{}",match[1])),warn2("Falling back to no-highlight mode for this block.",block2)),language2?match[1]:"no-highlight"}return classes.split(/\s+/).find(_class=>shouldNotHighlight(_class)||getLanguage(_class))}function highlight2(codeOrLanguageName,optionsOrCode,ignoreIllegals){let code2="",
-languageName="";typeof optionsOrCode=="object"?(code2=codeOrLanguageName,ignoreIllegals=optionsOrCode.ignoreIllegals,languageName=optionsOrCode.language):(deprecated2("10.7.0","highlight(lang, code, ...args) has been deprecated."),deprecated2("10.7.0",`Please use highlight(code, options) instead.
+Lines"],POSITIVE_INTEGER_FIELDS=["agenticMaxTurns","agenticMaxToolPreviewLines"],SETTINGS_KEYS={THEME:"theme",API_KEY:"apiKey",SYSTEM_MESSAGE:"systemMessage",PASTE_LONG_TEXT_TO_FILE_LEN:"pasteLongTextToFileLen",COPY_TEXT_ATTACHMENTS_AS_PLAIN_TEXT:"copyTextAttachmentsAsPlainText",SEND_ON_ENTER:"sendOnEnter",ENABLE_CONTINUE_GENERATION:"enableContinueGeneration",PDF_AS_IMAGE:"pdfAsImage",ASK_FOR_TITLE_CONFIRMATION:"askForTitleConfirmation",TITLE_GENERATION_USE_FIRST_LINE:"titleGenerationUseFirstLin\
+e",SHOW_MESSAGE_STATS:"showMessageStats",SHOW_THOUGHT_IN_PROGRESS:"showThoughtInProgress",KEEP_STATS_VISIBLE:"keepStatsVisible",AUTO_MIC_ON_EMPTY:"autoMicOnEmpty",RENDER_USER_CONTENT_AS_MARKDOWN:"renderUserContentAsMarkdown",DISABLE_AUTO_SCROLL:"disableAutoScroll",ALWAYS_SHOW_SIDEBAR_ON_DESKTOP:"alwaysShowSidebarOnDesktop",FULL_HEIGHT_CODE_BLOCKS:"fullHeightCodeBlocks",SHOW_RAW_MODEL_NAMES:"showRawModelNames",TEMPERATURE:"temperature",DYNATEMP_RANGE:"dynatemp_range",DYNATEMP_EXPONENT:"dynatemp_e\
+xponent",TOP_K:"top_k",TOP_P:"top_p",MIN_P:"min_p",XTC_PROBABILITY:"xtc_probability",XTC_THRESHOLD:"xtc_threshold",TYP_P:"typ_p",MAX_TOKENS:"max_tokens",SAMPLERS:"samplers",BACKEND_SAMPLING:"backend_sampling",REPEAT_LAST_N:"repeat_last_n",REPEAT_PENALTY:"repeat_penalty",PRESENCE_PENALTY:"presence_penalty",FREQUENCY_PENALTY:"frequency_penalty",DRY_MULTIPLIER:"dry_multiplier",DRY_BASE:"dry_base",DRY_ALLOWED_LENGTH:"dry_allowed_length",DRY_PENALTY_LAST_N:"dry_penalty_last_n",AGENTIC_MAX_TURNS:"agen\
+ticMaxTurns",ALWAYS_SHOW_AGENTIC_TURNS:"alwaysShowAgenticTurns",AGENTIC_MAX_TOOL_PREVIEW_LINES:"agenticMaxToolPreviewLines",SHOW_TOOL_CALL_IN_PROGRESS:"showToolCallInProgress",PRE_ENCODE_CONVERSATION:"preEncodeConversation",DISABLE_REASONING_PARSING:"disableReasoningParsing",EXCLUDE_REASONING_FROM_CONTEXT:"excludeReasoningFromContext",SHOW_RAW_OUTPUT_SWITCH:"showRawOutputSwitch",CUSTOM:"custom"},SETTINGS_SECTION_TITLES={GENERAL:"General",DISPLAY:"Display",SAMPLING:"Sampling",PENALTIES:"Penalties",
+AGENTIC:"Agentic",TOOLS:"Tools",IMPORT_EXPORT:"Import/Export",DEVELOPER:"Developer"},SETTINGS_CHAT_SECTIONS=[{title:SETTINGS_SECTION_TITLES.GENERAL,slug:"general",icon:Sliders_vertical,fields:[{key:SETTINGS_KEYS.THEME,label:"Theme",type:SettingsFieldType.SELECT,options:SETTINGS_COLOR_MODES_CONFIG},{key:SETTINGS_KEYS.API_KEY,label:"API Key",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.SYSTEM_MESSAGE,label:"System Message",type:SettingsFieldType.TEXTAREA},{key:SETTINGS_KEYS.PASTE_LONG_TEXT_TO_FILE_LEN,
+label:"Paste long text to file length",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.SEND_ON_ENTER,label:"Send message on Enter",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.COPY_TEXT_ATTACHMENTS_AS_PLAIN_TEXT,label:"Copy text attachments as plain text",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.ENABLE_CONTINUE_GENERATION,label:'Enable "Continue" button',type:SettingsFieldType.CHECKBOX,isExperimental:!0},{key:SETTINGS_KEYS.PDF_AS_IMAGE,label:"Parse PDF as image",type:SettingsFieldType.
+CHECKBOX},{key:SETTINGS_KEYS.ASK_FOR_TITLE_CONFIRMATION,label:"Ask for confirmation before changing conversation title",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.TITLE_GENERATION_USE_FIRST_LINE,label:"Use first non-empty line for conversation title",type:SettingsFieldType.CHECKBOX}]},{title:SETTINGS_SECTION_TITLES.DISPLAY,slug:"display",icon:Monitor,fields:[{key:SETTINGS_KEYS.SHOW_MESSAGE_STATS,label:"Show message generation statistics",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.
+SHOW_THOUGHT_IN_PROGRESS,label:"Show thought in progress",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.SHOW_TOOL_CALL_IN_PROGRESS,label:"Show tool call in progress",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.KEEP_STATS_VISIBLE,label:"Keep stats visible after generation",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.AUTO_MIC_ON_EMPTY,label:"Show microphone on empty input",type:SettingsFieldType.CHECKBOX,isExperimental:!0},{key:SETTINGS_KEYS.RENDER_USER_CONTENT_AS_MARKDOWN,label:"\
+Render user content as Markdown",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.FULL_HEIGHT_CODE_BLOCKS,label:"Use full height code blocks",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.DISABLE_AUTO_SCROLL,label:"Disable automatic scroll",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.ALWAYS_SHOW_SIDEBAR_ON_DESKTOP,label:"Always show sidebar on desktop",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.SHOW_RAW_MODEL_NAMES,label:"Show raw model names",type:SettingsFieldType.CHECKBOX},
+{key:SETTINGS_KEYS.ALWAYS_SHOW_AGENTIC_TURNS,label:"Always show agentic turns in conversation",type:SettingsFieldType.CHECKBOX}]},{title:SETTINGS_SECTION_TITLES.SAMPLING,slug:"sampling",icon:Funnel,fields:[{key:SETTINGS_KEYS.TEMPERATURE,label:"Temperature",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.DYNATEMP_RANGE,label:"Dynamic temperature range",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.DYNATEMP_EXPONENT,label:"Dynamic temperature exponent",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.
+TOP_K,label:"Top K",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.TOP_P,label:"Top P",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.MIN_P,label:"Min P",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.XTC_PROBABILITY,label:"XTC probability",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.XTC_THRESHOLD,label:"XTC threshold",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.TYP_P,label:"Typical P",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.MAX_TOKENS,label:"Max tokens",type:SettingsFieldType.
+INPUT},{key:SETTINGS_KEYS.SAMPLERS,label:"Samplers",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.BACKEND_SAMPLING,label:"Backend sampling",type:SettingsFieldType.CHECKBOX}]},{title:SETTINGS_SECTION_TITLES.PENALTIES,slug:"penalties",icon:Triangle_alert,fields:[{key:SETTINGS_KEYS.REPEAT_LAST_N,label:"Repeat last N",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.REPEAT_PENALTY,label:"Repeat penalty",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.PRESENCE_PENALTY,label:"Presence penalty",type:SettingsFieldType.
+INPUT},{key:SETTINGS_KEYS.FREQUENCY_PENALTY,label:"Frequency penalty",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.DRY_MULTIPLIER,label:"DRY multiplier",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.DRY_BASE,label:"DRY base",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.DRY_ALLOWED_LENGTH,label:"DRY allowed length",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.DRY_PENALTY_LAST_N,label:"DRY penalty last N",type:SettingsFieldType.INPUT}]},{title:SETTINGS_SECTION_TITLES.AGENTIC,slug:"a\
+gentic",icon:List_restart,fields:[{key:SETTINGS_KEYS.AGENTIC_MAX_TURNS,label:"Agentic turns",type:SettingsFieldType.INPUT},{key:SETTINGS_KEYS.AGENTIC_MAX_TOOL_PREVIEW_LINES,label:"Max lines per tool preview",type:SettingsFieldType.INPUT}]},{title:SETTINGS_SECTION_TITLES.TOOLS,slug:"tools",icon:Pencil_ruler},{title:SETTINGS_SECTION_TITLES.IMPORT_EXPORT,slug:"import-export",icon:Database},{title:SETTINGS_SECTION_TITLES.DEVELOPER,slug:"developer",icon:Code,fields:[{key:SETTINGS_KEYS.PRE_ENCODE_CONVERSATION,
+label:"Pre-fill KV cache after response",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.DISABLE_REASONING_PARSING,label:"Disable reasoning content parsing",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.EXCLUDE_REASONING_FROM_CONTEXT,label:"Exclude reasoning from context",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.SHOW_RAW_OUTPUT_SWITCH,label:"Enable raw output toggle",type:SettingsFieldType.CHECKBOX},{key:SETTINGS_KEYS.CUSTOM,label:"Custom JSON",type:SettingsFieldType.TEXTAREA}]}];
+FileTypeAudio.MP3+"",FileExtensionAudio.MP3,MimeTypeAudio.MP3_MPEG,MimeTypeAudio.MP3,FileTypeAudio.WAV+"",FileExtensionAudio.WAV,MimeTypeAudio.WAV;FileTypeImage.JPEG+"",FileExtensionImage.JPG,FileExtensionImage.JPEG,MimeTypeImage.JPEG,FileTypeImage.PNG+"",FileExtensionImage.PNG,MimeTypeImage.PNG,FileTypeImage.GIF+"",FileExtensionImage.GIF,MimeTypeImage.GIF,FileTypeImage.WEBP+"",FileExtensionImage.WEBP,MimeTypeImage.WEBP,FileTypeImage.SVG+"",FileExtensionImage.SVG,MimeTypeImage.SVG;FileTypePdf.PDF+
+"",FileExtensionPdf.PDF,MimeTypeApplication.PDF;FileTypeText.PLAIN_TEXT+"",FileExtensionText.TXT,MimeTypeText.PLAIN,FileTypeText.MARKDOWN+"",FileExtensionText.MD,MimeTypeText.MARKDOWN,FileTypeText.ASCIIDOC+"",FileExtensionText.ADOC,MimeTypeText.ASCIIDOC,FileTypeText.JAVASCRIPT+"",FileExtensionText.JS,MimeTypeText.JAVASCRIPT,MimeTypeText.JAVASCRIPT_APP,FileTypeText.TYPESCRIPT+"",FileExtensionText.TS,MimeTypeText.TYPESCRIPT,FileTypeText.JSX+"",FileExtensionText.JSX,MimeTypeText.JSX,FileTypeText.TSX+
+"",FileExtensionText.TSX,MimeTypeText.TSX,FileTypeText.CSS+"",FileExtensionText.CSS,MimeTypeText.CSS,FileTypeText.HTML+"",FileExtensionText.HTML,FileExtensionText.HTM,MimeTypeText.HTML,FileTypeText.JSON+"",FileExtensionText.JSON,MimeTypeText.JSON,FileTypeText.XML+"",FileExtensionText.XML,MimeTypeText.XML_TEXT,MimeTypeText.XML_APP,FileTypeText.YAML+"",FileExtensionText.YAML,FileExtensionText.YML,MimeTypeText.YAML_TEXT,MimeTypeText.YAML_APP,FileTypeText.CSV+"",FileExtensionText.CSV,MimeTypeText.CSV,
+FileTypeText.LOG+"",FileExtensionText.LOG,MimeTypeText.PLAIN,FileTypeText.PYTHON+"",FileExtensionText.PY,MimeTypeText.PYTHON,FileTypeText.JAVA+"",FileExtensionText.JAVA,MimeTypeText.JAVA,FileTypeText.CPP+"",FileExtensionText.CPP,FileExtensionText.C,FileExtensionText.H,FileExtensionText.HPP,MimeTypeText.CPP_SRC,MimeTypeText.CPP_HDR,MimeTypeText.C_SRC,MimeTypeText.C_HDR,FileTypeText.PHP+"",FileExtensionText.PHP,MimeTypeText.PHP,FileTypeText.RUBY+"",FileExtensionText.RB,MimeTypeText.RUBY,FileTypeText.
+GO+"",FileExtensionText.GO,MimeTypeText.GO,FileTypeText.RUST+"",FileExtensionText.RS,MimeTypeText.RUST,FileTypeText.SHELL+"",FileExtensionText.SH,FileExtensionText.BAT,MimeTypeText.SHELL,MimeTypeText.BAT,FileTypeText.SQL+"",FileExtensionText.SQL,MimeTypeText.SQL,FileTypeText.R+"",FileExtensionText.R,MimeTypeText.R,FileTypeText.SCALA+"",FileExtensionText.SCALA,MimeTypeText.SCALA,FileTypeText.KOTLIN+"",FileExtensionText.KT,MimeTypeText.KOTLIN,FileTypeText.SWIFT+"",FileExtensionText.SWIFT,MimeTypeText.
+SWIFT,FileTypeText.DART+"",FileExtensionText.DART,MimeTypeText.DART,FileTypeText.VUE+"",FileExtensionText.VUE,MimeTypeText.VUE,FileTypeText.SVELTE+"",FileExtensionText.SVELTE,MimeTypeText.SVELTE,FileTypeText.LATEX+"",FileExtensionText.TEX,MimeTypeText.LATEX,MimeTypeText.TEX,MimeTypeText.TEX_APP,FileTypeText.BIBTEX+"",FileExtensionText.BIB,MimeTypeText.BIBTEX,FileTypeText.CUDA+"",FileExtensionText.CU,FileExtensionText.CUH,MimeTypeText.CUDA,FileTypeText.VULKAN+"",FileExtensionText.COMP,MimeTypeText.
+PLAIN,FileTypeText.HASKELL+"",FileExtensionText.HS,MimeTypeText.HASKELL,FileTypeText.CSHARP+"",FileExtensionText.CS,MimeTypeText.CSHARP,FileTypeText.PROPERTIES+"",FileExtensionText.PROPERTIES,MimeTypeText.PROPERTIES;const BR_PATTERN=/<br\s*\/?\s*>/gi,LIST_PATTERN=/^<ul>([\s\S]*)<\/ul>$/i,LI_PATTERN=/<li>([\s\S]*?)<\/li>/gi,TOOL_GROUP_LABELS={[ToolSource.BUILTIN]:"Built-in",[ToolSource.CUSTOM]:"JSON Schema"},TOOL_SERVER_LABELS={[ToolSource.BUILTIN]:"Built-in Tools",[ToolSource.CUSTOM]:"Custom Too\
+ls"},TOOLTIP_DELAY_DURATION=500;var root$1R=from_svg('<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 174 174" xmlns:xlink="http://www.w3.org/1999/xlink" fill="none" version="1.1"><g id="shape-320b5b95-d08d-8089-8007-585a8e498184"><defs><clipPath id="frame-clip-320b5b95-d08d-8089-8007-585a8e498184-render-1" class="frame-clip frame-clip-def"><rect rx="0" ry="0" x="0" y="0" width="174.00000000000045" height="174" transform="matrix(1.000000, 0.000000, 0.000000, 1.000000, 0.000000, 0.000000)"><\
+/rect></clipPath></defs><g class="frame-container-wrapper"><g class="frame-container-blur"><g class="frame-container-shadows"><g clip-path="url(#frame-clip-320b5b95-d08d-8089-8007-585a8e498184-render-1)" fill="none"><g class="fills" id="fills-320b5b95-d08d-8089-8007-585a8e498184"><rect rx="0" ry="0" x="0" y="0" width="174.00000000000045" height="174" transform="matrix(1.000000, 0.000000, 0.000000, 1.000000, 0.000000, 0.000000)" class="frame-background"></rect></g><g class="frame-children"><g id=\
+"shape-320b5b95-d08d-8089-8007-585a974337b1"><g class="fills" id="fills-320b5b95-d08d-8089-8007-585a974337b1"><path d="M15.5587158203125,81.5927734375L83.44091796875,13.7105712890625C92.813720703125,4.3380126953125,108.0096435546875,4.3380126953125,117.3817138671875,13.7105712890625L117.3817138671875,13.7105712890625C126.7547607421875,23.08306884765625,126.7547607421875,38.27911376953125,117.3817138671875,47.65167236328125L66.1168212890625,98.9169921875" fill="none" stroke-linecap="round" style=\
+"fill: none;"></path></g><g fill="none" stroke-linecap="round" id="strokes-b954dcef-3e3e-8015-8007-585acd4382b6-320b5b95-d08d-8089-8007-585a974337b1" class="strokes"><g class="stroke-shape"><path d="M15.5587158203125,81.5927734375L83.44091796875,13.7105712890625C92.813720703125,4.3380126953125,108.0096435546875,4.3380126953125,117.3817138671875,13.7105712890625L117.3817138671875,13.7105712890625C126.7547607421875,23.08306884765625,126.7547607421875,38.27911376953125,117.3817138671875,47.65167236\
+328125L66.1168212890625,98.9169921875" style="fill: none; stroke-width: 12; stroke: currentColor; stroke-opacity: 1;"></path></g></g></g><g id="shape-320b5b95-d08d-8089-8007-585a974337b2"><g class="fills" id="fills-320b5b95-d08d-8089-8007-585a974337b2"><path d="M66.5587158203125,98.26885986328125L117.1165771484375,47.7105712890625C126.489501953125,38.3380126953125,141.6854248046875,38.3380126953125,151.0584716796875,47.7105712890625L151.4114990234375,48.0640869140625C160.7845458984375,57.4367065\
+4296875,160.7845458984375,72.6326904296875,151.4114990234375,82.00518798828125L90.018310546875,143.39886474609375C86.8941650390625,146.52288818359375,86.8941650390625,151.587890625,90.018310546875,154.71185302734375L102.62451171875,167.31890869140625" fill="none" stroke-linecap="round" style="fill: none;"></path></g><g fill="none" stroke-linecap="round" id="strokes-b954dcef-3e3e-8015-8007-585acd447743-320b5b95-d08d-8089-8007-585a974337b2" class="strokes"><g class="stroke-shape"><path d="M66.5587\
+158203125,98.26885986328125L117.1165771484375,47.7105712890625C126.489501953125,38.3380126953125,141.6854248046875,38.3380126953125,151.0584716796875,47.7105712890625L151.4114990234375,48.0640869140625C160.7845458984375,57.43670654296875,160.7845458984375,72.6326904296875,151.4114990234375,82.00518798828125L90.018310546875,143.39886474609375C86.8941650390625,146.52288818359375,86.8941650390625,151.587890625,90.018310546875,154.71185302734375L102.62451171875,167.31890869140625" style="fill: none;\
+ stroke-width: 12; stroke: currentColor; stroke-opacity: 1;"></path></g></g></g><g id="shape-320b5b95-d08d-8089-8007-585a974337b3"><g class="fills" id="fills-320b5b95-d08d-8089-8007-585a974337b3"><path d="M99.79296875,30.68115234375L49.588134765625,80.8857421875C40.215576171875,90.258056640625,40.215576171875,105.45404052734375,49.588134765625,114.82708740234375L49.588134765625,114.82708740234375C58.9608154296875,124.19903564453125,74.1566162109375,124.19903564453125,83.529296875,114.82708740234\
+375L133.7340087890625,64.62225341796875" fill="none" stroke-linecap="round" style="fill: none;"></path></g><g fill="none" stroke-linecap="round" id="strokes-b954dcef-3e3e-8015-8007-585acd44c5c9-320b5b95-d08d-8089-8007-585a974337b3" class="strokes"><g class="stroke-shape"><path d="M99.79296875,30.68115234375L49.588134765625,80.8857421875C40.215576171875,90.258056640625,40.215576171875,105.45404052734375,49.588134765625,114.82708740234375L49.588134765625,114.82708740234375C58.9608154296875,124.199\
+03564453125,74.1566162109375,124.19903564453125,83.529296875,114.82708740234375L133.7340087890625,64.62225341796875" style="fill: none; stroke-width: 12; stroke: currentColor; stroke-opacity: 1;"></path></g></g></g></g></g></g></g></g></g></svg>');function McpLogo($$anchor,$$props){let className=prop($$props,"class",3,""),style2=prop($$props,"style",3,"");var svg2=root$1R();template_effect(()=>{set_class(svg2,0,clsx(className())),set_style(svg2,style2())}),append($$anchor,svg2)}const FORK_TREE_DEPTH_PADDING=8,
+SYSTEM_MESSAGE_PLACEHOLDER="System message",APP_NAME="llama.cpp",ICON_STRIP_TRANSITION_DURATION=150,ICON_STRIP_TRANSITION_DELAY_MULTIPLIER=50,SIDEBAR_ACTIONS_ITEMS=[{icon:Square_pen,tooltip:"New chat",route:"?new_chat=true#/",keys:["shift","cmd","o"]},{icon:Search,tooltip:"Search",keys:["cmd","k"]},{icon:McpLogo,tooltip:"MCP Servers",route:"#/settings/mcp",activeRouteId:"/settings/mcp"},{icon:Settings$1,tooltip:"Settings",route:"#/settings/chat/general",activeRoutePrefix:"/settings/chat"}],URI_SCHEME_SEPARATOR="\
+://",TEMPLATE_EXPRESSION_REGEX=/\{([+#./;?&]?)([^}]+)\}/g,URI_TEMPLATE_OPERATORS={RESERVED:"+",FRAGMENT:"#",PATH_SEGMENT:"/",LABEL:".",PATH_PARAM:";",FORM_QUERY:"?",FORM_CONTINUATION:"&"},URI_TEMPLATE_SEPARATORS={COMMA:",",SLASH:"/",PERIOD:".",SEMICOLON:";",QUERY_PREFIX:"?",QUERY_CONTINUATION:"&"},VARIABLE_EXPLODE_MODIFIER_REGEX=/[*]$/,VARIABLE_PREFIX_MODIFIER_REGEX=/:[\d]+$/,LEADING_SLASHES_REGEX=/^\/+/,DEFAULT_MOBILE_BREAKPOINT=768;class IsMobile extends MediaQuery{constructor(breakpoint=DEFAULT_MOBILE_BREAKPOINT){
+super(`max-width: ${breakpoint-1}px`)}}const SYNCABLE_PARAMETERS=[{key:"temperature",serverKey:"temperature",type:SyncableParameterType.NUMBER,canSync:!0},{key:"top_k",serverKey:"top_k",type:SyncableParameterType.NUMBER,canSync:!0},{key:"top_p",serverKey:"top_p",type:SyncableParameterType.NUMBER,canSync:!0},{key:"min_p",serverKey:"min_p",type:SyncableParameterType.NUMBER,canSync:!0},{key:"dynatemp_range",serverKey:"dynatemp_range",type:SyncableParameterType.NUMBER,canSync:!0},{key:"dynatemp_expo\
+nent",serverKey:"dynatemp_exponent",type:SyncableParameterType.NUMBER,canSync:!0},{key:"xtc_probability",serverKey:"xtc_probability",type:SyncableParameterType.NUMBER,canSync:!0},{key:"xtc_threshold",serverKey:"xtc_threshold",type:SyncableParameterType.NUMBER,canSync:!0},{key:"typ_p",serverKey:"typ_p",type:SyncableParameterType.NUMBER,canSync:!0},{key:"repeat_last_n",serverKey:"repeat_last_n",type:SyncableParameterType.NUMBER,canSync:!0},{key:"repeat_penalty",serverKey:"repeat_penalty",type:SyncableParameterType.
+NUMBER,canSync:!0},{key:"presence_penalty",serverKey:"presence_penalty",type:SyncableParameterType.NUMBER,canSync:!0},{key:"frequency_penalty",serverKey:"frequency_penalty",type:SyncableParameterType.NUMBER,canSync:!0},{key:"dry_multiplier",serverKey:"dry_multiplier",type:SyncableParameterType.NUMBER,canSync:!0},{key:"dry_base",serverKey:"dry_base",type:SyncableParameterType.NUMBER,canSync:!0},{key:"dry_allowed_length",serverKey:"dry_allowed_length",type:SyncableParameterType.NUMBER,canSync:!0},
+{key:"dry_penalty_last_n",serverKey:"dry_penalty_last_n",type:SyncableParameterType.NUMBER,canSync:!0},{key:"max_tokens",serverKey:"max_tokens",type:SyncableParameterType.NUMBER,canSync:!0},{key:"samplers",serverKey:"samplers",type:SyncableParameterType.STRING,canSync:!0},{key:"backend_sampling",serverKey:"backend_sampling",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"pasteLongTextToFileLen",serverKey:"pasteLongTextToFileLen",type:SyncableParameterType.NUMBER,canSync:!0},{key:"pdfAsImage",
+serverKey:"pdfAsImage",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"showThoughtInProgress",serverKey:"showThoughtInProgress",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"keepStatsVisible",serverKey:"keepStatsVisible",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"showMessageStats",serverKey:"showMessageStats",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"askForTitleConfirmation",serverKey:"askForTitleConfirmation",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"\
+titleGenerationUseFirstLine",serverKey:"titleGenerationUseFirstLine",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"disableAutoScroll",serverKey:"disableAutoScroll",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"renderUserContentAsMarkdown",serverKey:"renderUserContentAsMarkdown",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"autoMicOnEmpty",serverKey:"autoMicOnEmpty",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"pyInterpreterEnabled",serverKey:"pyInterpreterEnabled",type:SyncableParameterType.
+BOOLEAN,canSync:!0},{key:"enableContinueGeneration",serverKey:"enableContinueGeneration",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"fullHeightCodeBlocks",serverKey:"fullHeightCodeBlocks",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"systemMessage",serverKey:"systemMessage",type:SyncableParameterType.STRING,canSync:!0},{key:"showSystemMessage",serverKey:"showSystemMessage",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"theme",serverKey:"theme",type:SyncableParameterType.STRING,
+canSync:!0},{key:"copyTextAttachmentsAsPlainText",serverKey:"copyTextAttachmentsAsPlainText",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"showRawOutputSwitch",serverKey:"showRawOutputSwitch",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"alwaysShowSidebarOnDesktop",serverKey:"alwaysShowSidebarOnDesktop",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"showRawModelNames",serverKey:"showRawModelNames",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"mcpServers",serverKey:"mcp\
+Servers",type:SyncableParameterType.STRING,canSync:!0},{key:"agenticMaxTurns",serverKey:"agenticMaxTurns",type:SyncableParameterType.NUMBER,canSync:!0},{key:"agenticMaxToolPreviewLines",serverKey:"agenticMaxToolPreviewLines",type:SyncableParameterType.NUMBER,canSync:!0},{key:"showToolCallInProgress",serverKey:"showToolCallInProgress",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"alwaysShowAgenticTurns",serverKey:"alwaysShowAgenticTurns",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"\
+excludeReasoningFromContext",serverKey:"excludeReasoningFromContext",type:SyncableParameterType.BOOLEAN,canSync:!0},{key:"sendOnEnter",serverKey:"sendOnEnter",type:SyncableParameterType.BOOLEAN,canSync:!0}];class ParameterSyncService{static roundFloatingPoint(value){return normalizeFloatingPoint(value)}static extractServerDefaults(serverParams,webuiSettings){const extracted={};if(serverParams){for(const param of SYNCABLE_PARAMETERS)if(param.canSync&&param.serverKey in serverParams){const value=serverParams[param.
+serverKey];value!==void 0&&(extracted[param.key]=this.roundFloatingPoint(value))}serverParams.samplers&&Array.isArray(serverParams.samplers)&&(extracted.samplers=serverParams.samplers.join(";"))}if(webuiSettings){for(const param of SYNCABLE_PARAMETERS)if(param.canSync&&param.serverKey in webuiSettings){const value=webuiSettings[param.serverKey];value!==void 0&&(extracted[param.key]=this.roundFloatingPoint(value))}}return extracted}static mergeWithServerDefaults(currentSettings,serverDefaults,userOverrides=new Set){
+const merged={...currentSettings};for(const[key2,serverValue]of Object.entries(serverDefaults))userOverrides.has(key2)||(merged[key2]=this.roundFloatingPoint(serverValue));return merged}static getParameterInfo(key2,currentValue,propsDefaults,userOverrides){const hasPropsDefault=propsDefaults[key2]!==void 0,isUserOverride=userOverrides.has(key2),source2=isUserOverride?ParameterSource.CUSTOM:ParameterSource.DEFAULT;return{value:currentValue,source:source2,serverDefault:hasPropsDefault?propsDefaults[key2]:
+void 0,userOverride:isUserOverride?currentValue:void 0}}static canSyncParameter(key2){return SYNCABLE_PARAMETERS.some(param=>param.key===key2&&param.canSync)}static getSyncableParameterKeys(){return SYNCABLE_PARAMETERS.filter(param=>param.canSync).map(param=>param.key)}static validateServerParameter(key2,value){const param=SYNCABLE_PARAMETERS.find(p2=>p2.key===key2);if(!param)return!1;switch(param.type){case SyncableParameterType.NUMBER:return typeof value=="number"&&!isNaN(value);case SyncableParameterType.
+STRING:return typeof value=="string";case SyncableParameterType.BOOLEAN:return typeof value=="boolean";default:return!1}}static createParameterDiff(currentSettings,serverDefaults){const diff2={};for(const key2 of this.getSyncableParameterKeys()){const currentValue=currentSettings[key2],serverValue=serverDefaults[key2];serverValue!==void 0&&(diff2[key2]={current:currentValue,server:serverValue,differs:currentValue!==serverValue})}return diff2}}class PropsService{static async fetch(autoload=!1){const params={};
+return autoload||(params.autoload="false"),apiFetchWithParams("./props",params,{authOnly:!0})}static async fetchForModel(modelId,autoload=!1){const params={model:modelId};return autoload||(params.autoload="false"),apiFetchWithParams("./props",params,{authOnly:!0})}}class ServerStore{#props=state$1(null);get props(){return get$3(this.#props)}set props(value){set$1(this.#props,value,!0)}#loading=state$1(!1);get loading(){return get$3(this.#loading)}set loading(value){set$1(this.#loading,value,!0)}#error=state$1(
+null);get error(){return get$3(this.#error)}set error(value){set$1(this.#error,value,!0)}#role=state$1(null);get role(){return get$3(this.#role)}set role(value){set$1(this.#role,value,!0)}fetchPromise=null;get defaultParams(){return this.props?.default_generation_settings?.params||null}get contextSize(){const nCtx=this.props?.default_generation_settings?.n_ctx;return typeof nCtx=="number"?nCtx:null}get webuiSettings(){return this.props?.webui_settings}get isRouterMode(){return this.role===ServerRole.
+ROUTER}get isModelMode(){return this.role===ServerRole.MODEL}async fetch(){if(this.fetchPromise)return this.fetchPromise;this.loading=!0,this.error=null;const fetchPromise=(async()=>{try{const props=await PropsService.fetch();this.props=props,this.error=null,this.detectRole(props)}catch(error2){this.error=this.getErrorMessage(error2),console.error("Error fetching server properties:",error2)}finally{this.loading=!1,this.fetchPromise=null}})();this.fetchPromise=fetchPromise,await fetchPromise}getErrorMessage(error2){
+if(error2 instanceof Error){const message=error2.message||"";if(error2.name==="TypeError"&&message.includes("fetch"))return"Server is not running or unreachable";if(message.includes("ECONNREFUSED"))return"Connection refused - server may be offline";if(message.includes("ENOTFOUND"))return"Server not found - check server address";if(message.includes("ETIMEDOUT"))return"Request timed out";if(message.includes("503"))return"Server temporarily unavailable";if(message.includes("500"))return"Server erro\
+r - check server logs";if(message.includes("404"))return"Server endpoint not found";if(message.includes("403")||message.includes("401"))return"Access denied"}return"Failed to connect to server"}clear(){this.props=null,this.error=null,this.loading=!1,this.role=null,this.fetchPromise=null}detectRole(props){const newRole=props?.role===ServerRole.ROUTER?ServerRole.ROUTER:ServerRole.MODEL;this.role!==newRole&&(this.role=newRole,console.info(`Server running in ${newRole===ServerRole.ROUTER?"ROUTER":"M\
+ODEL"} mode`))}}const serverStore=new ServerStore,serverProps=()=>serverStore.props,serverLoading=()=>serverStore.loading,serverError=()=>serverStore.error,contextSize=()=>serverStore.contextSize,isRouterMode=()=>serverStore.isRouterMode;class SettingsStore{#config=state$1(proxy({...SETTING_CONFIG_DEFAULT}));get config(){return get$3(this.#config)}set config(value){set$1(this.#config,value,!0)}#theme=state$1("auto");get theme(){return get$3(this.#theme)}set theme(value){set$1(this.#theme,value,!0)}#isInitialized=state$1(
+!1);get isInitialized(){return get$3(this.#isInitialized)}set isInitialized(value){set$1(this.#isInitialized,value,!0)}#userOverrides=state$1(proxy(new Set));get userOverrides(){return get$3(this.#userOverrides)}set userOverrides(value){set$1(this.#userOverrides,value,!0)}getServerDefaults(){const serverParams=serverStore.defaultParams,webuiSettings=serverStore.webuiSettings;return ParameterSyncService.extractServerDefaults(serverParams,webuiSettings)}constructor(){this.initialize()}initialize(){
+try{this.loadConfig(),this.loadTheme(),this.isInitialized=!0}catch(error2){console.error("Failed to initialize settings store:",error2)}}loadConfig(){try{const storedConfigRaw=localStorage.getItem(CONFIG_LOCALSTORAGE_KEY),savedVal=JSON.parse(storedConfigRaw||"{}");this.config={...SETTING_CONFIG_DEFAULT,...savedVal},"sendOnEnter"in savedVal||new IsMobile().current&&(this.config.sendOnEnter=!1);const savedOverrides=JSON.parse(localStorage.getItem(USER_OVERRIDES_LOCALSTORAGE_KEY)||"[]");this.userOverrides=
+new Set(savedOverrides)}catch(error2){console.warn("Failed to parse config from localStorage, using defaults:",error2),this.config={...SETTING_CONFIG_DEFAULT},this.userOverrides=new Set}}loadTheme(){this.theme=localStorage.getItem("theme")||"auto"}updateConfig(key2,value){if(this.config[key2]=value,ParameterSyncService.canSyncParameter(key2)){const propsDefault=this.getServerDefaults()[key2];if(propsDefault!==void 0){const normalizedValue=normalizeFloatingPoint(value),normalizedDefault=normalizeFloatingPoint(
+propsDefault);normalizedValue===normalizedDefault?this.userOverrides.delete(key2):this.userOverrides.add(key2)}}this.saveConfig()}updateMultipleConfig(updates){Object.assign(this.config,updates);const propsDefaults=this.getServerDefaults();for(const[key2,value]of Object.entries(updates))if(ParameterSyncService.canSyncParameter(key2)){const propsDefault=propsDefaults[key2];if(propsDefault!==void 0){const normalizedValue=normalizeFloatingPoint(value),normalizedDefault=normalizeFloatingPoint(propsDefault);
+normalizedValue===normalizedDefault?this.userOverrides.delete(key2):this.userOverrides.add(key2)}}this.saveConfig()}saveConfig(){try{localStorage.setItem(CONFIG_LOCALSTORAGE_KEY,JSON.stringify(this.config)),localStorage.setItem(USER_OVERRIDES_LOCALSTORAGE_KEY,JSON.stringify(Array.from(this.userOverrides)))}catch(error2){console.error("Failed to save config to localStorage:",error2)}}updateTheme(newTheme){this.theme=newTheme,this.saveTheme()}saveTheme(){try{this.theme==="auto"?localStorage.removeItem(
+"theme"):localStorage.setItem("theme",this.theme)}catch(error2){console.error("Failed to save theme to localStorage:",error2)}}resetConfig(){this.config={...SETTING_CONFIG_DEFAULT},this.saveConfig()}resetTheme(){this.theme="auto",this.saveTheme()}resetAll(){this.resetConfig(),this.resetTheme()}resetParameterToServerDefault(key2){const serverDefaults=this.getServerDefaults(),webuiSettings=serverStore.webuiSettings;webuiSettings&&key2 in webuiSettings?setConfigValue(this.config,key2,webuiSettings[key2]):
+serverDefaults[key2]!==void 0?setConfigValue(this.config,key2,""):key2 in SETTING_CONFIG_DEFAULT&&setConfigValue(this.config,key2,getConfigValue(SETTING_CONFIG_DEFAULT,key2)),this.userOverrides.delete(key2),this.saveConfig()}syncWithServerDefaults(){const propsDefaults=this.getServerDefaults();if(Object.keys(propsDefaults).length===0)return;const webuiSettings=serverStore.webuiSettings,webuiSettingsKeys=new Set(webuiSettings?Object.keys(webuiSettings):[]);for(const[key2,propsValue]of Object.entries(
+propsDefaults)){const currentValue=getConfigValue(this.config,key2),normalizedCurrent=normalizeFloatingPoint(currentValue),normalizedDefault=normalizeFloatingPoint(propsValue);normalizedCurrent===normalizedDefault&&(this.userOverrides.delete(key2),!webuiSettingsKeys.has(key2)&&getConfigValue(SETTING_CONFIG_DEFAULT,key2)===void 0&&setConfigValue(this.config,key2,void 0))}if(webuiSettings)for(const[key2,value]of Object.entries(webuiSettings))!this.userOverrides.has(key2)&&value!==void 0&&setConfigValue(
+this.config,key2,value);this.saveConfig(),console.log("User overrides after sync:",Array.from(this.userOverrides))}forceSyncWithServerDefaults(){const propsDefaults=this.getServerDefaults(),webuiSettings=serverStore.webuiSettings;for(const key2 of ParameterSyncService.getSyncableParameterKeys())webuiSettings&&key2 in webuiSettings?setConfigValue(this.config,key2,webuiSettings[key2]):propsDefaults[key2]!==void 0?setConfigValue(this.config,key2,""):key2 in SETTING_CONFIG_DEFAULT&&setConfigValue(this.
+config,key2,getConfigValue(SETTING_CONFIG_DEFAULT,key2)),this.userOverrides.delete(key2);this.saveConfig()}getConfig(key2){return this.config[key2]}getAllConfig(){return{...this.config}}canSyncParameter(key2){return ParameterSyncService.canSyncParameter(key2)}getParameterInfo(key2){const propsDefaults=this.getServerDefaults(),currentValue=getConfigValue(this.config,key2);return ParameterSyncService.getParameterInfo(key2,currentValue??"",propsDefaults,this.userOverrides)}getParameterDiff(){const serverDefaults=this.
+getServerDefaults();if(Object.keys(serverDefaults).length===0)return{};const configAsRecord=configToParameterRecord(this.config,ParameterSyncService.getSyncableParameterKeys());return ParameterSyncService.createParameterDiff(configAsRecord,serverDefaults)}clearAllUserOverrides(){this.userOverrides.clear(),this.saveConfig(),console.log("Cleared all user overrides")}}const settingsStore=new SettingsStore,config$1=()=>settingsStore.config;function redactValue(value,showLastChars){return showLastChars?
+`....${value.slice(-showLastChars)}`:"[redacted]"}function getAuthHeaders(){const apiKey=config$1().apiKey?.toString().trim();return apiKey?{Authorization:`Bearer ${apiKey}`}:{}}function getJsonHeaders(){return{"Content-Type":"application/json",...getAuthHeaders()}}function sanitizeHeaders(headers,extraRedactedHeaders,partialRedactHeaders){if(!headers)return{};const normalized=new Headers(headers),sanitized={},redactedHeaders=new Set(Array.from(extraRedactedHeaders??[],header=>header.toLowerCase()));
+for(const[key2,value]of normalized.entries()){const normalizedKey=key2.toLowerCase(),partialChars=partialRedactHeaders?.get(normalizedKey);partialChars!==void 0?sanitized[key2]=redactValue(value,partialChars):REDACTED_HEADERS.has(normalizedKey)||redactedHeaders.has(normalizedKey)?sanitized[key2]=redactValue(value):sanitized[key2]=value}return sanitized}async function apiFetch(path2,options={}){const{authOnly=!1,headers:customHeaders,...fetchOptions}=options,headers={...authOnly?getAuthHeaders():
+getJsonHeaders(),...customHeaders},url2=path2.startsWith(UrlProtocol.HTTP)||path2.startsWith(UrlProtocol.HTTPS)?path2:`${base}${path2}`,response=await fetch(url2,{...fetchOptions,headers});if(!response.ok){const errorMessage=await parseErrorMessage(response);throw new Error(errorMessage)}return response.json()}async function apiFetchWithParams(basePath,params,options={}){const url2=new URL(basePath,window.location.href);for(const[key2,value]of Object.entries(params))value!=null&&url2.searchParams.
+set(key2,value);const{authOnly=!1,headers:customHeaders,...fetchOptions}=options,headers={...authOnly?getAuthHeaders():getJsonHeaders(),...customHeaders},response=await fetch(url2.toString(),{...fetchOptions,headers});if(!response.ok){const errorMessage=await parseErrorMessage(response);throw new Error(errorMessage)}return response.json()}async function apiPost(path2,body2,options={}){return apiFetch(path2,{method:"POST",body:JSON.stringify(body2),...options})}async function parseErrorMessage(response){
+try{const errorData=await response.json();if(errorData?.error?.message)return errorData.error.message;if(errorData?.error&&typeof errorData.error=="string")return errorData.error;if(errorData?.message)return errorData.message}catch{}return`Request failed: ${response.status} ${response.statusText}`}function error(status,body2){throw new HttpError(status,body2)}async function validateApiKey(fetch2){try{const apiKey=config$1().apiKey,headers={"Content-Type":"application/json"};apiKey&&(headers.Authorization=
+`Bearer ${apiKey}`);const response=await fetch2(`${base}/props`,{headers});if(!response.ok){if(response.status===401||response.status===403)throw error(401,"Access denied");console.warn(`Server responded with status ${response.status} during API key validation`);return}}catch(err){if(err&&typeof err=="object"&&"status"in err)throw err;console.warn("Cannot connect to server for API key validation:",err)}}function isMcpPrompt(item){return!!(item.attachment?.type===AttachmentType.MCP_PROMPT||item.uploadedFile?.
+type===SpecialFileType.MCP_PROMPT&&item.uploadedFile.mcpPrompt)}function isMcpResource(item){return item.attachment?.type===AttachmentType.MCP_RESOURCE}function getUploadedFileCategory$1(file){const categoryByMime=getFileTypeCategory(file.type);return categoryByMime||getFileTypeCategoryByExtension(file.name)}function getAttachmentDisplayItems(options){const{uploadedFiles=[],attachments=[]}=options,items2=[];for(const file of uploadedFiles)items2.push({id:file.id,name:file.name,size:file.size,preview:file.
+preview,isImage:getUploadedFileCategory$1(file)===FileTypeCategory.IMAGE,isLoading:file.isLoading,loadError:file.loadError,uploadedFile:file,textContent:file.textContent});for(const[index2,attachment]of attachments.entries()){const isImage2=isImageFile(attachment);items2.push({id:`attachment-${index2}`,name:attachment.name,size:"size"in attachment?attachment.size:void 0,preview:isImage2&&"base64Url"in attachment?attachment.base64Url:void 0,isImage:isImage2,attachment,attachmentIndex:index2,textContent:"\
+content"in attachment?attachment.content:void 0})}return items2.reverse()}function getUploadedFileCategory(uploadedFile){const categoryByMime=getFileTypeCategory(uploadedFile.type);return categoryByMime||getFileTypeCategoryByExtension(uploadedFile.name)}function isTextFile(attachment,uploadedFile){return uploadedFile?getUploadedFileCategory(uploadedFile)===FileTypeCategory.TEXT:attachment?attachment.type===AttachmentType.TEXT||attachment.type===AttachmentType.LEGACY_CONTEXT:!1}function isImageFile(attachment,uploadedFile){
+return uploadedFile?getUploadedFileCategory(uploadedFile)===FileTypeCategory.IMAGE:attachment?attachment.type===AttachmentType.IMAGE:!1}function isPdfFile$1(attachment,uploadedFile){return uploadedFile?getUploadedFileCategory(uploadedFile)===FileTypeCategory.PDF:attachment?attachment.type===AttachmentType.PDF:!1}function isAudioFile(attachment,uploadedFile){return uploadedFile?getUploadedFileCategory(uploadedFile)===FileTypeCategory.AUDIO:attachment?attachment.type===AttachmentType.AUDIO:!1}function autoResizeTextarea(textareaElement){
+textareaElement&&(textareaElement.style.height="1rem",textareaElement.style.height=textareaElement.scrollHeight+"px")}function findMessageById(messages,id2){if(id2)return messages.find(m=>m.id===id2)}function filterByLeafNodeId(messages,leafNodeId,includeRoot=!1){const result=[],nodeMap=new Map;for(const msg of messages)nodeMap.set(msg.id,msg);let startNode=nodeMap.get(leafNodeId);if(!startNode){let latestTime=-1;for(const msg of messages)msg.timestamp>latestTime&&(startNode=msg,latestTime=msg.timestamp)}
+let currentNode=startNode;for(;currentNode&&((currentNode.type!=="root"||includeRoot)&&result.push(currentNode),currentNode.parent!==null);)currentNode=nodeMap.get(currentNode.parent);return result.sort((a,b)=>a.role===MessageRole.SYSTEM&&b.role!==MessageRole.SYSTEM?-1:a.role!==MessageRole.SYSTEM&&b.role===MessageRole.SYSTEM?1:a.timestamp-b.timestamp),result}function findLeafNode(messages,messageId){const nodeMap=new Map;for(const msg of messages)nodeMap.set(msg.id,msg);let currentNode=nodeMap.get(
+messageId);for(;currentNode&&currentNode.children.length>0;){const lastChildId=currentNode.children[currentNode.children.length-1];currentNode=nodeMap.get(lastChildId)}return currentNode?.id??messageId}function findDescendantMessages(messages,messageId){const nodeMap=new Map;for(const msg of messages)nodeMap.set(msg.id,msg);const descendants=[],queue=[messageId];for(;queue.length>0;){const currentId=queue.shift(),currentNode=nodeMap.get(currentId);if(currentNode)for(const childId of currentNode.
+children)descendants.push(childId),queue.push(childId)}return descendants}function getMessageSiblings(messages,messageId){const nodeMap=new Map;for(const msg of messages)nodeMap.set(msg.id,msg);const message=nodeMap.get(messageId);if(!message)return null;if(message.parent===null)return{message,siblingIds:[messageId],currentIndex:0,totalSiblings:1};const parentNode=nodeMap.get(message.parent);if(!parentNode)return{message,siblingIds:[messageId],currentIndex:0,totalSiblings:1};const siblingIds=parentNode.
+children,siblingLeafIds=siblingIds.map(siblingId=>findLeafNode(messages,siblingId)),currentIndex=siblingIds.indexOf(messageId);return{message,siblingIds:siblingLeafIds,currentIndex,totalSiblings:siblingIds.length}}var core$5,hasRequiredCore$4;function requireCore$4(){if(hasRequiredCore$4)return core$5;hasRequiredCore$4=1;function deepFreeze(obj){return obj instanceof Map?obj.clear=obj.delete=obj.set=function(){throw new Error("map is read-only")}:obj instanceof Set&&(obj.add=obj.clear=obj.delete=
+function(){throw new Error("set is read-only")}),Object.freeze(obj),Object.getOwnPropertyNames(obj).forEach(name=>{const prop2=obj[name],type2=typeof prop2;(type2==="object"||type2==="function")&&!Object.isFrozen(prop2)&&deepFreeze(prop2)}),obj}class Response2{constructor(mode){mode.data===void 0&&(mode.data={}),this.data=mode.data,this.isMatchIgnored=!1}ignoreMatch(){this.isMatchIgnored=!0}}function escapeHTML(value){return value.replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;").replace(
+/"/g,"&quot;").replace(/'/g,"&#x27;")}function inherit$1(original,...objects){const result=Object.create(null);for(const key2 in original)result[key2]=original[key2];return objects.forEach(function(obj){for(const key2 in obj)result[key2]=obj[key2]}),result}const SPAN_CLOSE="</span>",emitsWrappingTags=node2=>!!node2.scope,scopeToCSSClass=(name,{prefix})=>{if(name.startsWith("language:"))return name.replace("language:","language-");if(name.includes(".")){const pieces=name.split(".");return[`${prefix}${pieces.
+shift()}`,...pieces.map((x,i)=>`${x}${"_".repeat(i+1)}`)].join(" ")}return`${prefix}${name}`};class HTMLRenderer{constructor(parseTree3,options){this.buffer="",this.classPrefix=options.classPrefix,parseTree3.walk(this)}addText(text2){this.buffer+=escapeHTML(text2)}openNode(node2){if(!emitsWrappingTags(node2))return;const className=scopeToCSSClass(node2.scope,{prefix:this.classPrefix});this.span(className)}closeNode(node2){emitsWrappingTags(node2)&&(this.buffer+=SPAN_CLOSE)}value(){return this.buffer}span(className){
+this.buffer+=`<span class="${className}">`}}const newNode=(opts={})=>{const result={children:[]};return Object.assign(result,opts),result};class TokenTree{constructor(){this.rootNode=newNode(),this.stack=[this.rootNode]}get top(){return this.stack[this.stack.length-1]}get root(){return this.rootNode}add(node2){this.top.children.push(node2)}openNode(scope2){const node2=newNode({scope:scope2});this.add(node2),this.stack.push(node2)}closeNode(){if(this.stack.length>1)return this.stack.pop()}closeAllNodes(){
+for(;this.closeNode(););}toJSON(){return JSON.stringify(this.rootNode,null,4)}walk(builder){return this.constructor._walk(builder,this.rootNode)}static _walk(builder,node2){return typeof node2=="string"?builder.addText(node2):node2.children&&(builder.openNode(node2),node2.children.forEach(child2=>this._walk(builder,child2)),builder.closeNode(node2)),builder}static _collapse(node2){typeof node2!="string"&&node2.children&&(node2.children.every(el=>typeof el=="string")?node2.children=[node2.children.
+join("")]:node2.children.forEach(child2=>{TokenTree._collapse(child2)}))}}class TokenTreeEmitter extends TokenTree{constructor(options){super(),this.options=options}addText(text2){text2!==""&&this.add(text2)}startScope(scope2){this.openNode(scope2)}endScope(){this.closeNode()}__addSublanguage(emitter,name){const node2=emitter.root;name&&(node2.scope=`language:${name}`),this.add(node2)}toHTML(){return new HTMLRenderer(this,this.options).value()}finalize(){return this.closeAllNodes(),!0}}function source2(re2){
+return re2?typeof re2=="string"?re2:re2.source:null}function lookahead2(re2){return concat2("(?=",re2,")")}function anyNumberOfTimes(re2){return concat2("(?:",re2,")*")}function optional2(re2){return concat2("(?:",re2,")?")}function concat2(...args){return args.map(x=>source2(x)).join("")}function stripOptionsFromArgs2(args){const opts=args[args.length-1];return typeof opts=="object"&&opts.constructor===Object?(args.splice(args.length-1,1),opts):{}}function either2(...args){return"("+(stripOptionsFromArgs2(
+args).capture?"":"?:")+args.map(x=>source2(x)).join("|")+")"}function countMatchGroups(re2){return new RegExp(re2.toString()+"|").exec("").length-1}function startsWith(re2,lexeme){const match=re2&&re2.exec(lexeme);return match&&match.index===0}const BACKREF_RE=/\[(?:[^\\\]]|\\.)*\]|\(\??|\\([1-9][0-9]*)|\\./;function _rewriteBackreferences(regexps,{joinWith}){let numCaptures=0;return regexps.map(regex=>{numCaptures+=1;const offset2=numCaptures;let re2=source2(regex),out="";for(;re2.length>0;){const match=BACKREF_RE.
+exec(re2);if(!match){out+=re2;break}out+=re2.substring(0,match.index),re2=re2.substring(match.index+match[0].length),match[0][0]==="\\"&&match[1]?out+="\\"+String(Number(match[1])+offset2):(out+=match[0],match[0]==="("&&numCaptures++)}return out}).map(re2=>`(${re2})`).join(joinWith)}const MATCH_NOTHING_RE=/\b\B/,IDENT_RE2="[a-zA-Z]\\w*",UNDERSCORE_IDENT_RE="[a-zA-Z_]\\w*",NUMBER_RE="\\b\\d+(\\.\\d+)?",C_NUMBER_RE="(-?)(\\b0[xX][a-fA-F0-9]+|(\\b\\d+(\\.\\d*)?|\\.\\d+)([eE][-+]?\\d+)?)",BINARY_NUMBER_RE="\
+\\b(0b[01]+)",RE_STARTERS_RE="!|!=|!==|%|%=|&|&&|&=|\\*|\\*=|\\+|\\+=|,|-|-=|/=|/|:|;|<<|<<=|<=|<|===|==|=|>>>=|>>=|>=|>>>|>>|>|\\?|\\[|\\{|\\(|\\^|\\^=|\\||\\|=|\\|\\||~",SHEBANG=(opts={})=>{const beginShebang=/^#![ ]*\//;return opts.binary&&(opts.begin=concat2(beginShebang,/.*\b/,opts.binary,/\b.*/)),inherit$1({scope:"meta",begin:beginShebang,end:/$/,relevance:0,"on:begin":(m,resp)=>{m.index!==0&&resp.ignoreMatch()}},opts)},BACKSLASH_ESCAPE={begin:"\\\\[\\s\\S]",relevance:0},APOS_STRING_MODE={
+scope:"string",begin:"'",end:"'",illegal:"\\n",contains:[BACKSLASH_ESCAPE]},QUOTE_STRING_MODE={scope:"string",begin:'"',end:'"',illegal:"\\n",contains:[BACKSLASH_ESCAPE]},PHRASAL_WORDS_MODE={begin:/\b(a|an|the|are|I'm|isn't|don't|doesn't|won't|but|just|should|pretty|simply|enough|gonna|going|wtf|so|such|will|you|your|they|like|more)\b/},COMMENT=function(begin,end,modeOptions={}){const mode=inherit$1({scope:"comment",begin,end,contains:[]},modeOptions);mode.contains.push({scope:"doctag",begin:"[ \
+]*(?=(TODO|FIXME|NOTE|BUG|OPTIMIZE|HACK|XXX):)",end:/(TODO|FIXME|NOTE|BUG|OPTIMIZE|HACK|XXX):/,excludeBegin:!0,relevance:0});const ENGLISH_WORD=either2("I","a","is","so","us","to","at","if","in","it","on",/[A-Za-z]+['](d|ve|re|ll|t|s|n)/,/[A-Za-z]+[-][a-z]+/,/[A-Za-z][a-z]{2,}/);return mode.contains.push({begin:concat2(/[ ]+/,"(",ENGLISH_WORD,/[.]?[:]?([.][ ]|[ ])/,"){3}")}),mode},C_LINE_COMMENT_MODE=COMMENT("//","$"),C_BLOCK_COMMENT_MODE=COMMENT("/\\*","\\*/"),HASH_COMMENT_MODE=COMMENT("#","$"),
+NUMBER_MODE={scope:"number",begin:NUMBER_RE,relevance:0},C_NUMBER_MODE={scope:"number",begin:C_NUMBER_RE,relevance:0},BINARY_NUMBER_MODE={scope:"number",begin:BINARY_NUMBER_RE,relevance:0},REGEXP_MODE={scope:"regexp",begin:/\/(?=[^/\n]*\/)/,end:/\/[gimuy]*/,contains:[BACKSLASH_ESCAPE,{begin:/\[/,end:/\]/,relevance:0,contains:[BACKSLASH_ESCAPE]}]},TITLE_MODE={scope:"title",begin:IDENT_RE2,relevance:0},UNDERSCORE_TITLE_MODE={scope:"title",begin:UNDERSCORE_IDENT_RE,relevance:0},METHOD_GUARD={begin:"\
+\\.\\s*"+UNDERSCORE_IDENT_RE,relevance:0};var MODES2=Object.freeze({__proto__:null,APOS_STRING_MODE,BACKSLASH_ESCAPE,BINARY_NUMBER_MODE,BINARY_NUMBER_RE,COMMENT,C_BLOCK_COMMENT_MODE,C_LINE_COMMENT_MODE,C_NUMBER_MODE,C_NUMBER_RE,END_SAME_AS_BEGIN:function(mode){return Object.assign(mode,{"on:begin":(m,resp)=>{resp.data._beginMatch=m[1]},"on:end":(m,resp)=>{resp.data._beginMatch!==m[1]&&resp.ignoreMatch()}})},HASH_COMMENT_MODE,IDENT_RE:IDENT_RE2,MATCH_NOTHING_RE,METHOD_GUARD,NUMBER_MODE,NUMBER_RE,
+PHRASAL_WORDS_MODE,QUOTE_STRING_MODE,REGEXP_MODE,RE_STARTERS_RE,SHEBANG,TITLE_MODE,UNDERSCORE_IDENT_RE,UNDERSCORE_TITLE_MODE});function skipIfHasPrecedingDot(match,response){match.input[match.index-1]==="."&&response.ignoreMatch()}function scopeClassName(mode,_parent){mode.className!==void 0&&(mode.scope=mode.className,delete mode.className)}function beginKeywords(mode,parent){parent&&mode.beginKeywords&&(mode.begin="\\b("+mode.beginKeywords.split(" ").join("|")+")(?!\\.)(?=\\b|\\s)",mode.__beforeBegin=
+skipIfHasPrecedingDot,mode.keywords=mode.keywords||mode.beginKeywords,delete mode.beginKeywords,mode.relevance===void 0&&(mode.relevance=0))}function compileIllegal(mode,_parent){Array.isArray(mode.illegal)&&(mode.illegal=either2(...mode.illegal))}function compileMatch(mode,_parent){if(mode.match){if(mode.begin||mode.end)throw new Error("begin & end are not supported with match");mode.begin=mode.match,delete mode.match}}function compileRelevance(mode,_parent){mode.relevance===void 0&&(mode.relevance=
+1)}const beforeMatchExt=(mode,parent)=>{if(!mode.beforeMatch)return;if(mode.starts)throw new Error("beforeMatch cannot be used with starts");const originalMode=Object.assign({},mode);Object.keys(mode).forEach(key2=>{delete mode[key2]}),mode.keywords=originalMode.keywords,mode.begin=concat2(originalMode.beforeMatch,lookahead2(originalMode.begin)),mode.starts={relevance:0,contains:[Object.assign(originalMode,{endsParent:!0})]},mode.relevance=0,delete originalMode.beforeMatch},COMMON_KEYWORDS=["of",
+"and","for","in","not","or","if","then","parent","list","value"],DEFAULT_KEYWORD_SCOPE="keyword";function compileKeywords(rawKeywords,caseInsensitive,scopeName=DEFAULT_KEYWORD_SCOPE){const compiledKeywords=Object.create(null);return typeof rawKeywords=="string"?compileList(scopeName,rawKeywords.split(" ")):Array.isArray(rawKeywords)?compileList(scopeName,rawKeywords):Object.keys(rawKeywords).forEach(function(scopeName2){Object.assign(compiledKeywords,compileKeywords(rawKeywords[scopeName2],caseInsensitive,
+scopeName2))}),compiledKeywords;function compileList(scopeName2,keywordList){caseInsensitive&&(keywordList=keywordList.map(x=>x.toLowerCase())),keywordList.forEach(function(keyword2){const pair=keyword2.split("|");compiledKeywords[pair[0]]=[scopeName2,scoreForKeyword(pair[0],pair[1])]})}}function scoreForKeyword(keyword2,providedScore){return providedScore?Number(providedScore):commonKeyword(keyword2)?0:1}function commonKeyword(keyword2){return COMMON_KEYWORDS.includes(keyword2.toLowerCase())}const seenDeprecations={},
+error2=message=>{console.error(message)},warn2=(message,...args)=>{console.log(`WARN: ${message}`,...args)},deprecated2=(version3,message)=>{seenDeprecations[`${version3}/${message}`]||(console.log(`Deprecated as of ${version3}. ${message}`),seenDeprecations[`${version3}/${message}`]=!0)},MultiClassError=new Error;function remapScopeNames(mode,regexes,{key:key2}){let offset2=0;const scopeNames=mode[key2],emit={},positions={};for(let i=1;i<=regexes.length;i++)positions[i+offset2]=scopeNames[i],emit[i+
+offset2]=!0,offset2+=countMatchGroups(regexes[i-1]);mode[key2]=positions,mode[key2]._emit=emit,mode[key2]._multi=!0}function beginMultiClass(mode){if(Array.isArray(mode.begin)){if(mode.skip||mode.excludeBegin||mode.returnBegin)throw error2("skip, excludeBegin, returnBegin not compatible with beginScope: {}"),MultiClassError;if(typeof mode.beginScope!="object"||mode.beginScope===null)throw error2("beginScope must be object"),MultiClassError;remapScopeNames(mode,mode.begin,{key:"beginScope"}),mode.
+begin=_rewriteBackreferences(mode.begin,{joinWith:""})}}function endMultiClass(mode){if(Array.isArray(mode.end)){if(mode.skip||mode.excludeEnd||mode.returnEnd)throw error2("skip, excludeEnd, returnEnd not compatible with endScope: {}"),MultiClassError;if(typeof mode.endScope!="object"||mode.endScope===null)throw error2("endScope must be object"),MultiClassError;remapScopeNames(mode,mode.end,{key:"endScope"}),mode.end=_rewriteBackreferences(mode.end,{joinWith:""})}}function scopeSugar(mode){mode.
+scope&&typeof mode.scope=="object"&&mode.scope!==null&&(mode.beginScope=mode.scope,delete mode.scope)}function MultiClass(mode){scopeSugar(mode),typeof mode.beginScope=="string"&&(mode.beginScope={_wrap:mode.beginScope}),typeof mode.endScope=="string"&&(mode.endScope={_wrap:mode.endScope}),beginMultiClass(mode),endMultiClass(mode)}function compileLanguage(language2){function langRe(value,global2){return new RegExp(source2(value),"m"+(language2.case_insensitive?"i":"")+(language2.unicodeRegex?"u":
+"")+(global2?"g":""))}class MultiRegex{constructor(){this.matchIndexes={},this.regexes=[],this.matchAt=1,this.position=0}addRule(re2,opts){opts.position=this.position++,this.matchIndexes[this.matchAt]=opts,this.regexes.push([opts,re2]),this.matchAt+=countMatchGroups(re2)+1}compile(){this.regexes.length===0&&(this.exec=()=>null);const terminators=this.regexes.map(el=>el[1]);this.matcherRe=langRe(_rewriteBackreferences(terminators,{joinWith:"|"}),!0),this.lastIndex=0}exec(s2){this.matcherRe.lastIndex=
+this.lastIndex;const match=this.matcherRe.exec(s2);if(!match)return null;const i=match.findIndex((el,i2)=>i2>0&&el!==void 0),matchData=this.matchIndexes[i];return match.splice(0,i),Object.assign(match,matchData)}}class ResumableMultiRegex{constructor(){this.rules=[],this.multiRegexes=[],this.count=0,this.lastIndex=0,this.regexIndex=0}getMatcher(index2){if(this.multiRegexes[index2])return this.multiRegexes[index2];const matcher=new MultiRegex;return this.rules.slice(index2).forEach(([re2,opts])=>matcher.
+addRule(re2,opts)),matcher.compile(),this.multiRegexes[index2]=matcher,matcher}resumingScanAtSamePosition(){return this.regexIndex!==0}considerAll(){this.regexIndex=0}addRule(re2,opts){this.rules.push([re2,opts]),opts.type==="begin"&&this.count++}exec(s2){const m=this.getMatcher(this.regexIndex);m.lastIndex=this.lastIndex;let result=m.exec(s2);if(this.resumingScanAtSamePosition()&&!(result&&result.index===this.lastIndex)){const m2=this.getMatcher(0);m2.lastIndex=this.lastIndex+1,result=m2.exec(s2)}
+return result&&(this.regexIndex+=result.position+1,this.regexIndex===this.count&&this.considerAll()),result}}function buildModeRegex(mode){const mm=new ResumableMultiRegex;return mode.contains.forEach(term=>mm.addRule(term.begin,{rule:term,type:"begin"})),mode.terminatorEnd&&mm.addRule(mode.terminatorEnd,{type:"end"}),mode.illegal&&mm.addRule(mode.illegal,{type:"illegal"}),mm}function compileMode(mode,parent){const cmode=mode;if(mode.isCompiled)return cmode;[scopeClassName,compileMatch,MultiClass,
+beforeMatchExt].forEach(ext=>ext(mode,parent)),language2.compilerExtensions.forEach(ext=>ext(mode,parent)),mode.__beforeBegin=null,[beginKeywords,compileIllegal,compileRelevance].forEach(ext=>ext(mode,parent)),mode.isCompiled=!0;let keywordPattern=null;return typeof mode.keywords=="object"&&mode.keywords.$pattern&&(mode.keywords=Object.assign({},mode.keywords),keywordPattern=mode.keywords.$pattern,delete mode.keywords.$pattern),keywordPattern=keywordPattern||/\w+/,mode.keywords&&(mode.keywords=compileKeywords(
+mode.keywords,language2.case_insensitive)),cmode.keywordPatternRe=langRe(keywordPattern,!0),parent&&(mode.begin||(mode.begin=/\B|\b/),cmode.beginRe=langRe(cmode.begin),!mode.end&&!mode.endsWithParent&&(mode.end=/\B|\b/),mode.end&&(cmode.endRe=langRe(cmode.end)),cmode.terminatorEnd=source2(cmode.end)||"",mode.endsWithParent&&parent.terminatorEnd&&(cmode.terminatorEnd+=(mode.end?"|":"")+parent.terminatorEnd)),mode.illegal&&(cmode.illegalRe=langRe(mode.illegal)),mode.contains||(mode.contains=[]),mode.
+contains=[].concat(...mode.contains.map(function(c2){return expandOrCloneMode(c2==="self"?mode:c2)})),mode.contains.forEach(function(c2){compileMode(c2,cmode)}),mode.starts&&compileMode(mode.starts,parent),cmode.matcher=buildModeRegex(cmode),cmode}if(language2.compilerExtensions||(language2.compilerExtensions=[]),language2.contains&&language2.contains.includes("self"))throw new Error("ERR: contains `self` is not supported at the top-level of a language.  See documentation.");return language2.classNameAliases=
+inherit$1(language2.classNameAliases||{}),compileMode(language2)}function dependencyOnParent(mode){return mode?mode.endsWithParent||dependencyOnParent(mode.starts):!1}function expandOrCloneMode(mode){return mode.variants&&!mode.cachedVariants&&(mode.cachedVariants=mode.variants.map(function(variant){return inherit$1(mode,{variants:null},variant)})),mode.cachedVariants?mode.cachedVariants:dependencyOnParent(mode)?inherit$1(mode,{starts:mode.starts?inherit$1(mode.starts):null}):Object.isFrozen(mode)?
+inherit$1(mode):mode}var version2="11.11.1";class HTMLInjectionError extends Error{constructor(reason,html2){super(reason),this.name="HTMLInjectionError",this.html=html2}}const escape2=escapeHTML,inherit=inherit$1,NO_MATCH=Symbol("nomatch"),MAX_KEYWORD_HITS=7,HLJS=function(hljs){const languages=Object.create(null),aliases=Object.create(null),plugins=[];let SAFE_MODE=!0;const LANGUAGE_NOT_FOUND="Could not find the language '{}', did you forget to load/include a language module?",PLAINTEXT_LANGUAGE={
+disableAutodetect:!0,name:"Plain text",contains:[]};let options={ignoreUnescapedHTML:!1,throwUnescapedHTML:!1,noHighlightRe:/^(no-?highlight)$/i,languageDetectRe:/\blang(?:uage)?-([\w-]+)\b/i,classPrefix:"hljs-",cssSelector:"pre code",languages:null,__emitter:TokenTreeEmitter};function shouldNotHighlight(languageName){return options.noHighlightRe.test(languageName)}function blockLanguage(block2){let classes=block2.className+" ";classes+=block2.parentNode?block2.parentNode.className:"";const match=options.
+languageDetectRe.exec(classes);if(match){const language2=getLanguage(match[1]);return language2||(warn2(LANGUAGE_NOT_FOUND.replace("{}",match[1])),warn2("Falling back to no-highlight mode for this block.",block2)),language2?match[1]:"no-highlight"}return classes.split(/\s+/).find(_class=>shouldNotHighlight(_class)||getLanguage(_class))}function highlight2(codeOrLanguageName,optionsOrCode,ignoreIllegals){let code2="",languageName="";typeof optionsOrCode=="object"?(code2=codeOrLanguageName,ignoreIllegals=
+optionsOrCode.ignoreIllegals,languageName=optionsOrCode.language):(deprecated2("10.7.0","highlight(lang, code, ...args) has been deprecated."),deprecated2("10.7.0",`Please use highlight(code, options) instead.
 https://github.com/highlightjs/highlight.js/issues/2277`),languageName=codeOrLanguageName,code2=optionsOrCode),ignoreIllegals===void 0&&(ignoreIllegals=!0);const context={code:code2,language:languageName};fire("before:highlight",context);const result=context.result?context.result:_highlight(context.language,context.code,ignoreIllegals);return result.code=context.code,fire("after:highlight",result),result}function _highlight(languageName,codeToHighlight,ignoreIllegals,continuation){const keywordHits=Object.
 create(null);function keywordData(mode,matchText){return mode.keywords[matchText]}function processKeywords(){if(!top.keywords){emitter.addText(modeBuffer);return}let lastIndex=0;top.keywordPatternRe.lastIndex=0;let match=top.keywordPatternRe.exec(modeBuffer),buf="";for(;match;){buf+=modeBuffer.substring(lastIndex,match.index);const word=language2.case_insensitive?match[0].toLowerCase():match[0],data=keywordData(top,word);if(data){const[kind,keywordRelevance]=data;if(emitter.addText(buf),buf="",keywordHits[word]=
 (keywordHits[word]||0)+1,keywordHits[word]<=MAX_KEYWORD_HITS&&(relevance+=keywordRelevance),kind.startsWith("_"))buf+=match[0];else{const cssClass=language2.classNameAliases[kind]||kind;emitKeyword(match[0],cssClass)}}else buf+=match[0];lastIndex=top.keywordPatternRe.lastIndex,match=top.keywordPatternRe.exec(modeBuffer)}buf+=modeBuffer.substring(lastIndex),emitter.addText(buf)}function processSubLanguage(){if(modeBuffer==="")return;let result2=null;if(typeof top.subLanguage=="string"){if(!languages[top.
diff --git a/tools/server/webui/src/lib/constants/settings-sections.ts b/tools/server/webui/src/lib/constants/settings-sections.ts
index 1f2964c0ef1a..d88638ce6ca3 100644
--- a/tools/server/webui/src/lib/constants/settings-sections.ts
+++ b/tools/server/webui/src/lib/constants/settings-sections.ts
@@ -65,6 +65,11 @@ export const SETTINGS_CHAT_SECTIONS: SettingsSection[] = [
 				label: 'Paste long text to file length',
 				type: SettingsFieldType.INPUT
 			},
+			{
+				key: SETTINGS_KEYS.SEND_ON_ENTER,
+				label: 'Send message on Enter',
+				type: SettingsFieldType.CHECKBOX
+			},
 			{
 				key: SETTINGS_KEYS.COPY_TEXT_ATTACHMENTS_AS_PLAIN_TEXT,
 				label: 'Copy text attachments as plain text',
@@ -85,6 +90,11 @@ export const SETTINGS_CHAT_SECTIONS: SettingsSection[] = [
 				key: SETTINGS_KEYS.ASK_FOR_TITLE_CONFIRMATION,
 				label: 'Ask for confirmation before changing conversation title',
 				type: SettingsFieldType.CHECKBOX
+			},
+			{
+				key: SETTINGS_KEYS.TITLE_GENERATION_USE_FIRST_LINE,
+				label: 'Use first non-empty line for conversation title',
+				type: SettingsFieldType.CHECKBOX
 			}
 		]
 	},
@@ -297,6 +307,11 @@ export const SETTINGS_CHAT_SECTIONS: SettingsSection[] = [
 		slug: 'developer',
 		icon: Code,
 		fields: [
+			{
+				key: SETTINGS_KEYS.PRE_ENCODE_CONVERSATION,
+				label: 'Pre-fill KV cache after response',
+				type: SettingsFieldType.CHECKBOX
+			},
 			{
 				key: SETTINGS_KEYS.DISABLE_REASONING_PARSING,
 				label: 'Disable reasoning content parsing',

From c84e6d6db5c637ad8aacfdd41cf96fc1d2b85cec Mon Sep 17 00:00:00 2001
From: Evan Huus <eapache@gmail.com>
Date: Mon, 4 May 2026 06:19:41 -0400
Subject: [PATCH 06/13] server: Add a simple get_datetime server tool (#22649)

---
 common/arg.cpp                |  2 +-
 tools/server/README.md        |  2 +-
 tools/server/server-tools.cpp | 30 ++++++++++++++++++++++++++++++
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index dd8ce40a6159..8f54ee38c1b8 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2864,7 +2864,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--tools"}, "TOOL1,TOOL2,...",
         "experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)\n"
         "specify \"all\" to enable all tools\n"
-        "available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff",
+        "available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff, get_datetime",
         [](common_params & params, const std::string & value) {
             params.server_tools = parse_csv_row(value);
         }
diff --git a/tools/server/README.md b/tools/server/README.md
index eda875951322..a0cfdbb6fecc 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -191,7 +191,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `--webui-config JSON` | JSON that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG) |
 | `--webui-config-file PATH` | JSON file that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG_FILE) |
 | `--webui-mcp-proxy, --no-webui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)<br/>(env: LLAMA_ARG_WEBUI_MCP_PROXY) |
-| `--tools TOOL1,TOOL2,...` | experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)<br/>specify "all" to enable all tools<br/>available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff<br/>(env: LLAMA_ARG_TOOLS) |
+| `--tools TOOL1,TOOL2,...` | experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)<br/>specify "all" to enable all tools<br/>available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff, get_datetime<br/>(env: LLAMA_ARG_TOOLS) |
 | `--webui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_WEBUI) |
 | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
 | `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
diff --git a/tools/server/server-tools.cpp b/tools/server/server-tools.cpp
index 81e360de4639..49bec4be4f33 100644
--- a/tools/server/server-tools.cpp
+++ b/tools/server/server-tools.cpp
@@ -693,6 +693,35 @@ struct server_tool_apply_diff : server_tool {
     }
 };
 
+//
+// get_datetime: returns the current date and time
+//
+
+struct server_tool_get_datetime : server_tool {
+    server_tool_get_datetime() {
+        name = "get_datetime";
+        display_name = "Get Date & Time";
+        permission_write = false;
+    }
+
+    json get_definition() override {
+        return {
+            {"type", "function"},
+            {"function", {
+                {"name", name},
+                {"description", "Returns the current date and time"},
+            }},
+        };
+    }
+
+    json invoke(json) override {
+        auto now = std::chrono::system_clock::now();
+        auto time = std::chrono::system_clock::to_time_t(now);
+
+        return {{"result", std::ctime(&time)}};
+    }
+};
+
 //
 // public API
 //
@@ -706,6 +735,7 @@ static std::vector<std::unique_ptr<server_tool>> build_tools() {
     tools.push_back(std::make_unique<server_tool_write_file>());
     tools.push_back(std::make_unique<server_tool_edit_file>());
     tools.push_back(std::make_unique<server_tool_apply_diff>());
+    tools.push_back(std::make_unique<server_tool_get_datetime>());
     return tools;
 }
 

From 994118a1831fdede28ee2d228e0570db9b728c45 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Mon, 4 May 2026 12:36:59 +0200
Subject: [PATCH 07/13] model: move `load_hparams` and `load_tensors` to
 per-model definition (#22004)

* git-friendly migration

* add build_graph

* nits

* exclude old code from build

* wip

* add llm_arch_model_i

* prepare downstream functions

* nits

* nits

* wip

* wip

* add back create_tensor_qkv

* fix files missing include

* enforce one llm_build per arch

* cmake: use glob

* missing model params

* nits

* wip

* wip (2)

* wip (3)

* test-llama-archs is happy

* improve switch case

* move more stuff into llm_arch_model_i

* fix downstream code

* nits

* nits (2)

* fix order

* llama_model_base

* LLAMA_LOAD_LOCALS

* small fix

* fix build errors

* auto

* rm migration script and ifdef
---
 src/llama-model.cpp                           | 9202 ++---------------
 src/llama-model.h                             |   89 +-
 src/llama-quant.cpp                           |   23 +-
 src/llama.cpp                                 |  236 +-
 src/models/afmoe.cpp                          |  108 +-
 src/models/apertus.cpp                        |   58 +-
 src/models/arcee.cpp                          |   47 +-
 src/models/arctic.cpp                         |   55 +-
 src/models/arwkv7.cpp                         |  118 +-
 src/models/baichuan.cpp                       |   45 +-
 src/models/bailingmoe.cpp                     |   61 +-
 src/models/bailingmoe2.cpp                    |   96 +-
 src/models/bert.cpp                           |   79 +-
 src/models/bitnet.cpp                         |   49 +-
 src/models/bloom.cpp                          |   64 +-
 src/models/chameleon.cpp                      |   52 +-
 src/models/chatglm.cpp                        |   55 +-
 src/models/codeshell.cpp                      |   51 +-
 src/models/cogvlm.cpp                         |   51 +-
 src/models/{cohere2-iswa.cpp => cohere2.cpp}  |   49 +-
 src/models/command-r.cpp                      |   42 +-
 src/models/dbrx.cpp                           |   46 +-
 src/models/deci.cpp                           |   78 +-
 src/models/deepseek.cpp                       |   73 +-
 src/models/deepseek2.cpp                      |  144 +-
 src/models/deepseek2ocr.cpp                   |   82 +
 src/models/dots1.cpp                          |   72 +-
 src/models/dream.cpp                          |   50 +-
 src/models/ernie4-5-moe.cpp                   |    6 +-
 src/models/ernie4-5.cpp                       |   75 +-
 src/models/eurobert.cpp                       |   37 +-
 src/models/exaone-moe.cpp                     |  113 +-
 src/models/exaone.cpp                         |   45 +-
 src/models/exaone4.cpp                        |   70 +-
 src/models/falcon-h1.cpp                      |  111 +-
 src/models/falcon.cpp                         |   49 +-
 src/models/gemma-embedding.cpp                |   74 +-
 src/models/gemma.cpp                          |   40 +-
 src/models/{gemma2-iswa.cpp => gemma2.cpp}    |   61 +-
 src/models/gemma3.cpp                         |   86 +-
 src/models/{gemma3n-iswa.cpp => gemma3n.cpp}  |   99 +-
 src/models/{gemma4-iswa.cpp => gemma4.cpp}    |  135 +-
 src/models/glm-dsa.cpp                        |  155 +
 src/models/glm4-moe.cpp                       |  135 +-
 src/models/glm4.cpp                           |   74 +-
 src/models/gpt2.cpp                           |   56 +-
 src/models/gptneox.cpp                        |   85 +-
 src/models/granite-hybrid.cpp                 |  137 +-
 src/models/granite-moe.cpp                    |   89 +
 src/models/granite.cpp                        |   93 +-
 src/models/grok.cpp                           |   85 +-
 src/models/grovemoe.cpp                       |   66 +-
 src/models/hunyuan-dense.cpp                  |  132 +-
 src/models/hunyuan-moe.cpp                    |   55 +-
 src/models/hunyuan-vl.cpp                     |  189 +
 src/models/internlm2.cpp                      |   39 +-
 src/models/jais.cpp                           |   54 +-
 src/models/jais2.cpp                          |   57 +-
 src/models/jamba.cpp                          |  107 +-
 src/models/jina-bert-v2.cpp                   |   66 +
 src/models/jina-bert-v3.cpp                   |   69 +
 src/models/kimi-linear.cpp                    |  172 +-
 src/models/lfm2.cpp                           |   92 +-
 src/models/lfm2moe.cpp                        |   85 +
 src/models/llada-moe.cpp                      |   52 +-
 src/models/llada.cpp                          |   68 +-
 src/models/llama-embed.cpp                    |    6 +
 src/models/llama.cpp                          |  100 +-
 src/models/llama4.cpp                         |  108 +-
 src/models/maincoder.cpp                      |   45 +-
 src/models/mamba.cpp                          |   87 +-
 src/models/mamba2.cpp                         |   87 +
 src/models/{mimo2-iswa.cpp => mimo2.cpp}      |   60 +-
 src/models/minicpm.cpp                        |   89 +
 src/models/minicpm3.cpp                       |   62 +-
 src/models/minimax-m2.cpp                     |   46 +-
 src/models/mistral3.cpp                       |   92 +-
 src/models/mistral4.cpp                       |    6 +
 src/models/models.h                           | 1866 +++-
 src/models/modern-bert.cpp                    |   65 +-
 src/models/mpt.cpp                            |   66 +-
 src/models/nemotron-h-moe.cpp                 |    6 +
 src/models/nemotron-h.cpp                     |  127 +-
 src/models/nemotron.cpp                       |   48 +-
 src/models/neo-bert.cpp                       |   42 +-
 src/models/nomic-bert-moe.cpp                 |   72 +
 src/models/nomic-bert.cpp                     |   72 +
 src/models/olmo.cpp                           |   42 +-
 src/models/olmo2.cpp                          |   67 +-
 src/models/olmoe.cpp                          |   51 +-
 .../{openai-moe-iswa.cpp => openai-moe.cpp}   |   63 +-
 src/models/openelm.cpp                        |   49 +-
 src/models/orion.cpp                          |   42 +-
 src/models/paddleocr.cpp                      |    6 +-
 .../{pangu-embedded.cpp => pangu-embed.cpp}   |   56 +-
 src/models/phi2.cpp                           |   46 +-
 src/models/phi3.cpp                           |   70 +-
 src/models/phimoe.cpp                         |   55 +
 src/models/plamo.cpp                          |   38 +-
 src/models/plamo2.cpp                         |  109 +-
 src/models/plamo3.cpp                         |   73 +-
 src/models/plm.cpp                            |   46 +-
 src/models/qwen.cpp                           |   42 +-
 src/models/qwen2.cpp                          |   51 +-
 src/models/qwen2moe.cpp                       |   63 +-
 src/models/qwen2vl.cpp                        |   41 +-
 src/models/qwen3.cpp                          |   51 +-
 src/models/qwen35.cpp                         |  102 +-
 src/models/qwen35moe.cpp                      |  115 +-
 src/models/qwen3moe.cpp                       |   61 +-
 src/models/qwen3next.cpp                      |  119 +-
 src/models/qwen3vl.cpp                        |   52 +-
 .../{qwen3vl-moe.cpp => qwen3vlmoe.cpp}       |   63 +-
 src/models/refact.cpp                         |   77 +-
 src/models/rnd1.cpp                           |   62 +-
 src/models/rwkv6.cpp                          |   93 +-
 src/models/rwkv6qwen2.cpp                     |   83 +-
 src/models/rwkv7.cpp                          |  123 +-
 src/models/seed-oss.cpp                       |   47 +-
 src/models/smallthinker.cpp                   |   79 +-
 src/models/smollm3.cpp                        |   45 +-
 src/models/stablelm.cpp                       |   50 +-
 src/models/starcoder.cpp                      |   58 +-
 src/models/starcoder2.cpp                     |   57 +-
 src/models/{step35-iswa.cpp => step35.cpp}    |  104 +-
 src/models/t5.cpp                             |  122 +-
 src/models/t5encoder.cpp                      |   43 +-
 src/models/wavtokenizer-dec.cpp               |  117 +-
 src/models/xverse.cpp                         |   39 +-
 129 files changed, 11431 insertions(+), 8881 deletions(-)
 rename src/models/{cohere2-iswa.cpp => cohere2.cpp} (60%)
 create mode 100644 src/models/deepseek2ocr.cpp
 rename src/models/{gemma2-iswa.cpp => gemma2.cpp} (53%)
 rename src/models/{gemma3n-iswa.cpp => gemma3n.cpp} (76%)
 rename src/models/{gemma4-iswa.cpp => gemma4.cpp} (64%)
 create mode 100644 src/models/glm-dsa.cpp
 create mode 100644 src/models/granite-moe.cpp
 create mode 100644 src/models/hunyuan-vl.cpp
 create mode 100644 src/models/jina-bert-v2.cpp
 create mode 100644 src/models/jina-bert-v3.cpp
 create mode 100644 src/models/lfm2moe.cpp
 create mode 100644 src/models/llama-embed.cpp
 create mode 100644 src/models/mamba2.cpp
 rename src/models/{mimo2-iswa.cpp => mimo2.cpp} (57%)
 create mode 100644 src/models/minicpm.cpp
 create mode 100644 src/models/mistral4.cpp
 create mode 100644 src/models/nemotron-h-moe.cpp
 create mode 100644 src/models/nomic-bert-moe.cpp
 create mode 100644 src/models/nomic-bert.cpp
 rename src/models/{openai-moe-iswa.cpp => openai-moe.cpp} (51%)
 rename src/models/{pangu-embedded.cpp => pangu-embed.cpp} (53%)
 create mode 100644 src/models/phimoe.cpp
 rename src/models/{qwen3vl-moe.cpp => qwen3vlmoe.cpp} (57%)
 rename src/models/{step35-iswa.cpp => step35.cpp} (52%)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 54caff987d14..9a5802e3242d 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -34,6 +34,285 @@
 #include <string>
 #include <vector>
 
+static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params & params) {
+    switch (arch) {
+        case LLM_ARCH_LLAMA:
+            return new llama_model_llama(params);
+        case LLM_ARCH_LLAMA4:
+            return new llama_model_llama4(params);
+        case LLM_ARCH_LLAMA_EMBED:
+            return new llama_model_llama_embed(params);
+        case LLM_ARCH_MAINCODER:
+            return new llama_model_maincoder(params);
+        case LLM_ARCH_DECI:
+            return new llama_model_deci(params);
+        case LLM_ARCH_BAICHUAN:
+            return new llama_model_baichuan(params);
+        case LLM_ARCH_FALCON:
+            return new llama_model_falcon(params);
+        case LLM_ARCH_GROK:
+            return new llama_model_grok(params);
+        case LLM_ARCH_STARCODER:
+            return new llama_model_starcoder(params);
+        case LLM_ARCH_REFACT:
+            return new llama_model_refact(params);
+        case LLM_ARCH_BERT:
+            return new llama_model_bert(params);
+        case LLM_ARCH_JINA_BERT_V2:
+            return new llama_model_jina_bert_v2(params);
+        case LLM_ARCH_JINA_BERT_V3:
+            return new llama_model_jina_bert_v3(params);
+        case LLM_ARCH_NOMIC_BERT:
+            return new llama_model_nomic_bert(params);
+        case LLM_ARCH_NOMIC_BERT_MOE:
+            return new llama_model_nomic_bert_moe(params);
+        case LLM_ARCH_MODERN_BERT:
+            return new llama_model_modern_bert(params);
+        case LLM_ARCH_NEO_BERT:
+            return new llama_model_neo_bert(params);
+        case LLM_ARCH_EUROBERT:
+            return new llama_model_eurobert(params);
+        case LLM_ARCH_BLOOM:
+            return new llama_model_bloom(params);
+        case LLM_ARCH_MPT:
+            return new llama_model_mpt(params);
+        case LLM_ARCH_STABLELM:
+            return new llama_model_stablelm(params);
+        case LLM_ARCH_QWEN:
+            return new llama_model_qwen(params);
+        case LLM_ARCH_QWEN2:
+            return new llama_model_qwen2(params);
+        case LLM_ARCH_DREAM:
+            return new llama_model_dream(params);
+        case LLM_ARCH_LLADA:
+            return new llama_model_llada(params);
+        case LLM_ARCH_LLADA_MOE:
+            return new llama_model_llada_moe(params);
+        case LLM_ARCH_RND1:
+            return new llama_model_rnd1(params);
+        case LLM_ARCH_QWEN2VL:
+            return new llama_model_qwen2vl(params);
+        case LLM_ARCH_QWEN2MOE:
+            return new llama_model_qwen2moe(params);
+        case LLM_ARCH_QWEN3:
+            return new llama_model_qwen3(params);
+        case LLM_ARCH_QWEN3MOE:
+            return new llama_model_qwen3moe(params);
+        case LLM_ARCH_QWEN3VL:
+            return new llama_model_qwen3vl(params);
+        case LLM_ARCH_QWEN3VLMOE:
+            return new llama_model_qwen3vlmoe(params);
+        case LLM_ARCH_PHI2:
+            return new llama_model_phi2(params);
+        case LLM_ARCH_PHI3:
+            return new llama_model_phi3(params);
+        case LLM_ARCH_PHIMOE:
+            return new llama_model_phimoe(params);
+        case LLM_ARCH_PLAMO:
+            return new llama_model_plamo(params);
+        case LLM_ARCH_PLAMO2:
+            return new llama_model_plamo2(params);
+        case LLM_ARCH_PLAMO3:
+            return new llama_model_plamo3(params);
+        case LLM_ARCH_GPT2:
+            return new llama_model_gpt2(params);
+        case LLM_ARCH_CODESHELL:
+            return new llama_model_codeshell(params);
+        case LLM_ARCH_ORION:
+            return new llama_model_orion(params);
+        case LLM_ARCH_INTERNLM2:
+            return new llama_model_internlm2(params);
+        case LLM_ARCH_MINICPM3:
+            return new llama_model_minicpm3(params);
+        case LLM_ARCH_GEMMA:
+            return new llama_model_gemma(params);
+        case LLM_ARCH_GEMMA2:
+            return new llama_model_gemma2(params);
+        case LLM_ARCH_GEMMA3:
+            return new llama_model_gemma3(params);
+        case LLM_ARCH_GEMMA3N:
+            return new llama_model_gemma3n(params);
+        case LLM_ARCH_GEMMA4:
+            return new llama_model_gemma4(params);
+        case LLM_ARCH_GEMMA_EMBEDDING:
+            return new llama_model_gemma_embedding(params);
+        case LLM_ARCH_STARCODER2:
+            return new llama_model_starcoder2(params);
+        case LLM_ARCH_MAMBA:
+            return new llama_model_mamba(params);
+        case LLM_ARCH_MAMBA2:
+            return new llama_model_mamba2(params);
+        case LLM_ARCH_JAMBA:
+            return new llama_model_jamba(params);
+        case LLM_ARCH_XVERSE:
+            return new llama_model_xverse(params);
+        case LLM_ARCH_COMMAND_R:
+            return new llama_model_command_r(params);
+        case LLM_ARCH_COHERE2:
+            return new llama_model_cohere2(params);
+        case LLM_ARCH_DBRX:
+            return new llama_model_dbrx(params);
+        case LLM_ARCH_OLMO:
+            return new llama_model_olmo(params);
+        case LLM_ARCH_OLMO2:
+            return new llama_model_olmo2(params);
+        case LLM_ARCH_OLMOE:
+            return new llama_model_olmoe(params);
+        case LLM_ARCH_OPENELM:
+            return new llama_model_openelm(params);
+        case LLM_ARCH_GPTNEOX:
+            return new llama_model_gptneox(params);
+        case LLM_ARCH_ARCTIC:
+            return new llama_model_arctic(params);
+        case LLM_ARCH_DEEPSEEK:
+            return new llama_model_deepseek(params);
+        case LLM_ARCH_DEEPSEEK2:
+            return new llama_model_deepseek2(params);
+        case LLM_ARCH_DEEPSEEK2OCR:
+            return new llama_model_deepseek2ocr(params);
+        case LLM_ARCH_GLM_DSA:
+            return new llama_model_glm_dsa(params);
+        case LLM_ARCH_MISTRAL4:
+            return new llama_model_mistral4(params);
+        case LLM_ARCH_CHATGLM:
+            return new llama_model_chatglm(params);
+        case LLM_ARCH_GLM4:
+            return new llama_model_glm4(params);
+        case LLM_ARCH_GLM4_MOE:
+            return new llama_model_glm4_moe(params);
+        case LLM_ARCH_BITNET:
+            return new llama_model_bitnet(params);
+        case LLM_ARCH_T5:
+            return new llama_model_t5(params);
+        case LLM_ARCH_T5ENCODER:
+            return new llama_model_t5encoder(params);
+        case LLM_ARCH_JAIS:
+            return new llama_model_jais(params);
+        case LLM_ARCH_JAIS2:
+            return new llama_model_jais2(params);
+        case LLM_ARCH_NEMOTRON:
+            return new llama_model_nemotron(params);
+        case LLM_ARCH_NEMOTRON_H:
+            return new llama_model_nemotron_h(params);
+        case LLM_ARCH_NEMOTRON_H_MOE:
+            return new llama_model_nemotron_h_moe(params);
+        case LLM_ARCH_EXAONE:
+            return new llama_model_exaone(params);
+        case LLM_ARCH_EXAONE4:
+            return new llama_model_exaone4(params);
+        case LLM_ARCH_EXAONE_MOE:
+            return new llama_model_exaone_moe(params);
+        case LLM_ARCH_RWKV6:
+            return new llama_model_rwkv6(params);
+        case LLM_ARCH_RWKV6QWEN2:
+            return new llama_model_rwkv6qwen2(params);
+        case LLM_ARCH_RWKV7:
+            return new llama_model_rwkv7(params);
+        case LLM_ARCH_ARWKV7:
+            return new llama_model_arwkv7(params);
+        case LLM_ARCH_GRANITE:
+            return new llama_model_granite(params);
+        case LLM_ARCH_GRANITE_MOE:
+            return new llama_model_granite_moe(params);
+        case LLM_ARCH_MINICPM:
+            return new llama_model_minicpm(params);
+        case LLM_ARCH_GRANITE_HYBRID:
+            return new llama_model_granite_hybrid(params);
+        case LLM_ARCH_CHAMELEON:
+            return new llama_model_chameleon(params);
+        case LLM_ARCH_WAVTOKENIZER_DEC:
+            return new llama_model_wavtokenizer_dec(params);
+        case LLM_ARCH_PLM:
+            return new llama_model_plm(params);
+        case LLM_ARCH_BAILINGMOE:
+            return new llama_model_bailingmoe(params);
+        case LLM_ARCH_BAILINGMOE2:
+            return new llama_model_bailingmoe2(params);
+        case LLM_ARCH_SEED_OSS:
+            return new llama_model_seed_oss(params);
+        case LLM_ARCH_DOTS1:
+            return new llama_model_dots1(params);
+        case LLM_ARCH_ARCEE:
+            return new llama_model_arcee(params);
+        case LLM_ARCH_AFMOE:
+            return new llama_model_afmoe(params);
+        case LLM_ARCH_ERNIE4_5:
+            return new llama_model_ernie4_5(params);
+        case LLM_ARCH_ERNIE4_5_MOE:
+            return new llama_model_ernie4_5_moe(params);
+        case LLM_ARCH_PADDLEOCR:
+            return new llama_model_paddleocr(params);
+        case LLM_ARCH_HUNYUAN_MOE:
+            return new llama_model_hunyuan_moe(params);
+        case LLM_ARCH_HUNYUAN_VL:
+            return new llama_model_hunyuan_vl(params);
+        case LLM_ARCH_HUNYUAN_DENSE:
+            return new llama_model_hunyuan_dense(params);
+        case LLM_ARCH_SMOLLM3:
+            return new llama_model_smollm3(params);
+        case LLM_ARCH_OPENAI_MOE:
+            return new llama_model_openai_moe(params);
+        case LLM_ARCH_FALCON_H1:
+            return new llama_model_falcon_h1(params);
+        case LLM_ARCH_LFM2:
+            return new llama_model_lfm2(params);
+        case LLM_ARCH_LFM2MOE:
+            return new llama_model_lfm2moe(params);
+        case LLM_ARCH_SMALLTHINKER:
+            return new llama_model_smallthinker(params);
+        case LLM_ARCH_GROVEMOE:
+            return new llama_model_grovemoe(params);
+        case LLM_ARCH_APERTUS:
+            return new llama_model_apertus(params);
+        case LLM_ARCH_MINIMAX_M2:
+            return new llama_model_minimax_m2(params);
+        case LLM_ARCH_COGVLM:
+            return new llama_model_cogvlm(params);
+        case LLM_ARCH_PANGU_EMBED:
+            return new llama_model_pangu_embed(params);
+        case LLM_ARCH_QWEN3NEXT:
+            return new llama_model_qwen3next(params);
+        case LLM_ARCH_QWEN35:
+            return new llama_model_qwen35(params);
+        case LLM_ARCH_QWEN35MOE:
+            return new llama_model_qwen35moe(params);
+        case LLM_ARCH_MISTRAL3:
+            return new llama_model_mistral3(params);
+        case LLM_ARCH_MIMO2:
+            return new llama_model_mimo2(params);
+        case LLM_ARCH_KIMI_LINEAR:
+            return new llama_model_kimi_linear(params);
+        case LLM_ARCH_STEP35:
+            return new llama_model_step35(params);
+        default:
+            GGML_ABORT("unimplemented model class");
+    }
+
+}
+
+llama_model * llama_model_create(llm_arch arch, const llama_model_params & params) {
+    llama_model * model = llama_model_mapping(arch, params);
+
+    if (model != nullptr) {
+        model->arch = arch;
+        auto & devices = model->devices;
+        if (!devices.empty() && devices[0].is_meta && !llm_arch_supports_sm_tensor(arch)) {
+            throw std::runtime_error(std::string("LLAMA_SPLIT_MODE_TENSOR not implemented for architecture '") + llm_arch_name(arch) + "'");
+        }
+    }
+
+    return model;
+}
+
+llama_model * llama_model_create(llama_model_loader & ml, const llama_model_params & params) {
+    llm_arch arch = ml.get_arch();
+    if (arch == LLM_ARCH_UNKNOWN) {
+        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
+    }
+
+    return llama_model_create(arch, params);
+}
+
 struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const struct ggml_tensor * tensor, void * userdata) {
     const llama_meta_device_get_split_state_userdata * ud = (const llama_meta_device_get_split_state_userdata *) userdata;
     const llama_hparams & hparams = ud->model->hparams;
@@ -688,22 +967,12 @@ llama_model::~llama_model() {
     }
 }
 
-void llama_model::load_stats(llama_model_loader & ml) {
+void llama_model_base::load_stats(llama_model_loader & ml) {
     pimpl->n_elements = ml.n_elements;
     pimpl->n_bytes = ml.n_bytes;
 }
 
-void llama_model::load_arch(llama_model_loader & ml) {
-    arch = ml.get_arch();
-    if (arch == LLM_ARCH_UNKNOWN) {
-        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
-    }
-    if (!devices.empty() && devices[0].is_meta && !llm_arch_supports_sm_tensor(arch)) {
-        throw std::runtime_error(std::string("LLAMA_SPLIT_MODE_TENSOR not implemented for architecture '") + llm_arch_name(arch) + "'");
-    }
-}
-
-void llama_model::load_hparams(llama_model_loader & ml) {
+void llama_model_base::load_hparams(llama_model_loader & ml) {
     const gguf_context * ctx = ml.metadata;
 
     // get metadata as string
@@ -872,8205 +1141,924 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         hparams.n_cls_out = classifier_labels.size();
     }
 
-    // arch-specific KVs
-    switch (arch) {
-        case LLM_ARCH_LLAMA:
-        case LLM_ARCH_LLAMA_EMBED:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                if (hparams.n_expert == 8) {
-                    switch (hparams.n_layer) {
-                        case 32: type = LLM_TYPE_8x7B; break;
-                        case 56: type = LLM_TYPE_8x22B; break;
-                        default: type = LLM_TYPE_UNKNOWN;
-                    }
-                } else {
-                    switch (hparams.n_layer) {
-                        case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
-                        case 22: type = LLM_TYPE_1B; break;
-                        case 26: type = LLM_TYPE_3B; break;
-                        case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
-                        case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
-                        // granite uses a vocab with len 49152
-                        case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
-                        case 36: type = LLM_TYPE_8B; break; // granite
-                        case 40: type = LLM_TYPE_13B; break;
-                        case 48: type = LLM_TYPE_34B; break;
-                        case 60: type = LLM_TYPE_30B; break;
-                        case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
-                        default: type = LLM_TYPE_UNKNOWN;
-                    }
-                }
-            } break;
-        case LLM_ARCH_LLAMA4:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
-                ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,   hparams.n_moe_layer_step);
-
-                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-                if (found_swa && hparams.n_swa == 0) {
-                    hparams.swa_type             = LLAMA_SWA_TYPE_NONE;
-                    hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
-                } else {
-                    hparams.swa_type                = LLAMA_SWA_TYPE_CHUNKED;
-                    hparams.n_swa                   = 8192;
-                    hparams.n_attn_temp_floor_scale = 8192;
-                    hparams.f_attn_temp_scale       = 0.1f;
-                    hparams.f_attn_temp_offset      = 1.0f;
-                    uint32_t swa_period             = 4; // pattern: 3 chunked - 1 full
-                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
-                    hparams.set_swa_pattern(swa_period);
-
-                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
-                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
-                }
+    // per-arch hparams
+    load_arch_hparams(ml);
 
-                switch (hparams.n_expert) {
-                    case 0: {
-                        // MobileLLM (no MoE)
-                        switch (hparams.n_embd) {
-                            case 2048: type = LLM_TYPE_140M; break;
-                            case 4096: type = LLM_TYPE_360M; break;
-                            case 6144: type = LLM_TYPE_950M; break;
-                            default:   type = LLM_TYPE_UNKNOWN;
-                        }
-                    } break;
-                    case 16:  type = LLM_TYPE_17B_16E; break;
-                    case 128: type = LLM_TYPE_17B_128E; break;
-                    default:  type = LLM_TYPE_UNKNOWN;
-                }
+    pimpl->n_bytes = ml.n_bytes;
 
-                hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
-            } break;
-        case LLM_ARCH_ARCEE:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
 
-                // Arcee uses the same structure as Llama
-                switch (hparams.n_layer) {
-                    case 36: type = LLM_TYPE_4B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_AFMOE:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
-                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
-                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
-
-                // Set up interleaved sliding window attention (ISWA)
-                // Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
-                if (hparams.n_swa > 0) {
-                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                    uint32_t swa_period = 4;
-                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
-                    hparams.set_swa_pattern(swa_period);
-
-                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
-                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
-                } else {
-                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-                }
+    if (hparams.f_max_alibi_bias > 0.0f) {
+        hparams.use_alibi = true;
+    }
 
-                // Default to sigmoid if not set
-                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
-                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
-                }
+    hparams.rope_type = llama_model_rope_type(this);
+}
 
-                switch (hparams.n_layer) {
-                    case 56: type = LLM_TYPE_6B; break;
-                    case 32: type = LLM_TYPE_26B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_DECI:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_7B; break;
-                    case 80: type = LLM_TYPE_70B; break;
-                    case 162: type = LLM_TYPE_405B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_MINICPM:
-            {
-                // Backward-compatible defaults for older MiniCPM GGUFs
-                hparams.f_embedding_scale = 12.0f;
-                hparams.f_residual_scale  = 1.4f / sqrtf(float(hparams.n_layer));
-                hparams.f_logit_scale     = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f;
+void llama_model_base::load_vocab(llama_model_loader & ml) {
+    const auto kv = LLM_KV(arch);
 
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    vocab.load(ml, kv);
+}
 
-                // Optional KV reads, override defaults if present in newer GGUF exports
-                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /*required=*/false);
-                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /*required=*/false);
-                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /*required=*/false);
+bool llama_model_base::load_tensors(llama_model_loader & ml) {
+    const auto & split_mode   = params.split_mode;
+    const auto & use_mlock    = params.use_mlock;
+    const auto & tensor_split = params.tensor_split;
 
-                // MiniCPM uses rope by default, unlike Granite which uses it as a switch
-                hparams.rope_finetuned = true;
+    const int n_layer      = hparams.n_layer;
+    const int n_gpu_layers = this->n_gpu_layers();
 
-                switch (hparams.n_layer) {
-                    case 52: type = LLM_TYPE_1B; break;
-                    case 40: type = LLM_TYPE_2B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_MINICPM3:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,       hparams.n_lora_q);
-                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,      hparams.n_lora_kv);
+    const bool use_mmap_buffer = true;
 
-                switch (hparams.n_layer) {
-                    case 62: type = LLM_TYPE_4B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_GROK:
-            {
-                // defaults for old GGUFs
-                hparams.yarn_beta_fast = 8.0f;
-                hparams.f_logit_scale = 0.5773502691896257f;
-                hparams.f_embedding_scale = 78.38367176906169f;
-                hparams.f_attn_out_scale = 0.08838834764831845f;
-                hparams.f_attn_logit_softcapping = 30.0f;
-                hparams.f_router_logit_softcapping = 30.0f;
-                // no final_logit_softcapping in grok-1
-                hparams.f_final_logit_softcapping = 0.0f;
-
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,  hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,   hparams.n_ff_exp, false);
-                ml.get_key(LLM_KV_LOGIT_SCALE,                  hparams.f_logit_scale, false);
-                ml.get_key(LLM_KV_EMBEDDING_SCALE,              hparams.f_embedding_scale, false);
-                ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE,       hparams.f_attn_out_scale, false);
-                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING,       hparams.f_attn_logit_softcapping, false);
-                ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING,     hparams.f_router_logit_softcapping, false);
-                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,      hparams.f_final_logit_softcapping, false);
-
-                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH,  hparams.attn_temp_length, false);
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,  hparams.yarn_ext_factor, false);
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST,   hparams.yarn_beta_fast, false);
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,   hparams.yarn_beta_slow, false);
-
-                switch (hparams.n_layer) {
-                    case 64: type = LLM_TYPE_314B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_FALCON:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+    this->ml = &ml; // to be used by create_tensor() and load_arch_tensors()
 
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_7B; break;
-                    case 60: type = LLM_TYPE_40B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_BAICHUAN:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_7B; break;
-                    case 40: type = LLM_TYPE_13B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
+    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
+        __func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
 
-                if (type == LLM_TYPE_13B) {
-                    // TODO: become GGUF KV parameter
-                    hparams.f_max_alibi_bias = 8.0f;
-                }
-            } break;
-        case LLM_ARCH_STARCODER:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                switch (hparams.n_layer) {
-                    case 24: type = LLM_TYPE_1B; break;
-                    case 36: type = LLM_TYPE_3B; break;
-                    case 42: type = LLM_TYPE_7B; break;
-                    case 40: type = LLM_TYPE_15B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_REFACT:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_1B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
+    // build a list of buffer types for the CPU and GPU devices
+    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
+    for (const auto & dev : devices) {
+        buft_list_t buft_list = make_gpu_buft_list(dev.dev, split_mode, tensor_split);
+        // add CPU buffer types as a fallback
+        buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
+        pimpl->gpu_buft_list.emplace(dev.dev, std::move(buft_list));
+    }
 
-                // TODO: become GGUF KV parameter
-                hparams.f_max_alibi_bias = 8.0f;
-            } break;
-        case LLM_ARCH_BERT:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
-
-                switch (hparams.n_layer) {
-                    case 3:
-                        type = LLM_TYPE_17M; break; // bge-micro
-                    case 6:
-                        type = LLM_TYPE_22M; break; // MiniLM-L6
-                    case 12:
-                        switch (hparams.n_embd) {
-                            case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
-                            case 768: type = LLM_TYPE_109M; break; // bge-base
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    case 24:
-                        type = LLM_TYPE_335M; break; // bge-large
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_MODERN_BERT:
-            {
-                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-                if (found_swa && hparams.n_swa > 0) {
-                    hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
-                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
-                    uint32_t swa_period = 3;
-                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
-                    hparams.set_swa_pattern(swa_period, true);
-                } else {
-                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-                }
+    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+    }
 
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+    // calculate the split points
+    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
+    std::vector<float> splits(n_devices());
+    if (all_zero) {
+        // default split, by free memory
+        for (size_t i = 0; i < n_devices(); ++i) {
+            ggml_backend_dev_t dev = devices[i].dev;
+            size_t total;
+            size_t free;
+            ggml_backend_dev_memory(dev, &free, &total);
 
-                switch (hparams.n_layer) {
-                    case 12:
-                        type = LLM_TYPE_47M; break; // granite-embedding-small
-                    case 22:
-                        type = LLM_TYPE_149M; break; // modern-bert-base
-                    case 28:
-                        type = LLM_TYPE_395M; break; // modern-bert-large
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_JINA_BERT_V2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
-                hparams.f_max_alibi_bias = 8.0f;
-
-                switch (hparams.n_layer) {
-                    case 4:  type = LLM_TYPE_33M;  break; // jina-embeddings-small
-                    case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_JINA_BERT_V3:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
-
-                switch (hparams.n_layer) {
-                    case 24:
-                        type = LLM_TYPE_558M; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_NOMIC_BERT:
-        case LLM_ARCH_NOMIC_BERT_MOE:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
-                ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS,         hparams.moe_every_n_layers, 0);
-
-                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
-                    if (arch == LLM_ARCH_NOMIC_BERT) {
-                        type = LLM_TYPE_137M;
-                    } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
-                        type = LLM_TYPE_475M;
-                    }
-                }
-            } break;
-        case LLM_ARCH_NEO_BERT:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                if (hparams.n_layer == 28) {
-                    type = LLM_TYPE_250M;
-                }
-            } break;
-        case LLM_ARCH_EUROBERT:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                if (hparams.n_layer == 12) {
-                    type = LLM_TYPE_SMALL;  // 0.2B
-                }
-            } break;
-        case LLM_ARCH_BLOOM:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-
-                switch (hparams.n_layer) {
-                    case 24: type = LLM_TYPE_1B; break;
-                    case 30:
-                        switch (hparams.n_embd) {
-                            case 2560: type = LLM_TYPE_3B; break;
-                            case 4096: type = LLM_TYPE_7B; break;
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-
-                // TODO: become GGUF KV parameter
-                hparams.f_max_alibi_bias = 8.0f;
-            } break;
-        case LLM_ARCH_MPT:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
-                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false);
-                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
-
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_7B; break;
-                    case 48: type = LLM_TYPE_30B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_STABLELM:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-
-                switch (hparams.n_layer) {
-                    case 24: type = LLM_TYPE_1B; break;
-                    case 32: type = LLM_TYPE_3B; break;
-                    case 40: type = LLM_TYPE_12B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-               }
-            } break;
-        case LLM_ARCH_QWEN:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_7B; break;
-                    case 40: type = LLM_TYPE_13B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_QWEN2VL:
-            {
-                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
+            // devices can return 0 bytes for free and total memory if they do not
+            // have any to report. in this case, we will use the host memory as a fallback
+            // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+            if (free == 0 && total == 0) {
+                ggml_backend_dev_memory(cpu_dev, &free, &total);
             }
-            // fall through
-        case LLM_ARCH_QWEN2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
-                    case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
-                    case 32: type = LLM_TYPE_7B; break;
-                    case 36: type = LLM_TYPE_3B; break;
-                    case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
-                    case 48: type = LLM_TYPE_14B; break;
-                    case 64: type = LLM_TYPE_32B; break;
-                    case 80: type = LLM_TYPE_70B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_DREAM:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                // Dream models are primarily 7B with 28 layers
-                switch (hparams.n_layer) {
-                    case 28:
-                        type = LLM_TYPE_7B;
-                        break;
-                    default:
-                        type = LLM_TYPE_UNKNOWN;
-                }
-                // Set non-causal attention for diffusion models
-                hparams.causal_attn = false;
-            } break;
-        case LLM_ARCH_LLADA:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
-                switch (hparams.n_layer) {
-                    case 32:
-                        type = LLM_TYPE_8B;
-                        break;
-                    default:
-                        type = LLM_TYPE_UNKNOWN;
-                }
-                // Set non-causal attention for diffusion models
-                hparams.causal_attn = false;
-            } break;
-        case LLM_ARCH_LLADA_MOE:
-            {
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
-
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                // diffusion language model uses non-causal attention
-                hparams.causal_attn = false;
-                switch (hparams.n_layer) {
-                    case 16: type = LLM_TYPE_A1_7B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_RND1:
-            {
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
-
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 48: type = LLM_TYPE_30B_A3B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-                // Set non-causal attention for diffusion models
-                hparams.causal_attn = false;
-            } break;
-        case LLM_ARCH_QWEN2MOE:
-            {
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
-                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
-
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 24: type = LLM_TYPE_A2_7B; break;
-                    case 28: type = LLM_TYPE_57B_A14B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_QWEN3:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
-                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
-                    case 40: type = LLM_TYPE_14B; break;
-                    case 64: type = LLM_TYPE_32B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_MAINCODER:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_1B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_QWEN3VL:
-            {
-                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
-                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 28: type = LLM_TYPE_1_7B; break;
-                    case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
-                    case 64: type = LLM_TYPE_32B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_QWEN3MOE:
-            {
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
-
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 48: type = LLM_TYPE_30B_A3B; break;
-                    case 94: type = LLM_TYPE_235B_A22B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_QWEN3VLMOE:
-            {
-                ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
-                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 48: type = LLM_TYPE_30B_A3B; break;
-                    case 94: type = LLM_TYPE_235B_A22B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_PHI2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+            splits[i] = free;
+        }
+    } else {
+        std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
+    }
 
-                switch (hparams.n_layer) {
-                    case 24: type = LLM_TYPE_1B; break;
-                    case 32: type = LLM_TYPE_3B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_PHI3:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    // sum and normalize the splits to get the split points
+    float split_sum = 0.0f;
+    for (size_t i = 0; i < n_devices(); ++i) {
+        split_sum += splits[i];
+        splits[i] = split_sum;
+    }
+    for (size_t i = 0; i < n_devices(); ++i) {
+        splits[i] /= split_sum;
+    }
 
-                switch (hparams.n_layer) {
-                    case 24: type = LLM_TYPE_1B; break;
-                    case 32: type = LLM_TYPE_3B; break;
-                    case 40: type = LLM_TYPE_14B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
+    const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
+    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
+    auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
+        const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
+        if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
+            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
+            return {cpu_dev, &pimpl->cpu_buft_list};
+        }
+        const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
+        auto * dev = devices.at(layer_gpu).dev;
+        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
+        return {dev, &pimpl->gpu_buft_list.at(dev)};
+    };
 
-                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+    // assign the input layer
+    // there is very little benefit to offloading the input layer, so always keep it on the CPU
+    pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
 
-                if (found_swa && hparams.n_swa > 0) {
-                    LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
-                            __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
+    // assign the repeating layers to the devices according to the splits
+    pimpl->dev_layer.resize(n_layer);
+    for (int il = 0; il < n_layer; ++il) {
+        pimpl->dev_layer[il] = get_layer_buft_list(il);
+    }
 
-                    // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
-                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+    // assign the output layer
+    pimpl->dev_output = get_layer_buft_list(n_layer);
 
-                    hparams.n_swa         = 0;
-                    hparams.set_swa_pattern(1);
-                }
-            } break;
-        case LLM_ARCH_PHIMOE:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
 
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_16x3_8B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_PLAMO:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    // create tensors for the weights
+    {
+        // TODO: move to a separate function
+        const auto tn = LLM_TN(arch);
 
-                switch (hparams.n_layer) {
-                    case 40: type = LLM_TYPE_13B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-               }
-            } break;
-        case LLM_ARCH_PLAMO2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+        const int64_t n_expert      = hparams.n_expert;
+        const int64_t n_expert_used = hparams.n_expert_used;
 
-                // Load Mamba SSM parameters
-                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
-                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
-                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
-                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
-                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
+        if (n_expert > 0 && n_expert_used == 0) {
+            throw std::runtime_error("model has expert layers but no expert layers are used");
+        }
 
-                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
-                }
+        layers.resize(n_layer);
 
-                switch (hparams.n_layer) {
-                    case 16: type = LLM_TYPE_1B; break;
-                    case 32:
-                        if (hparams.n_embd == 2048) {
-                            type = LLM_TYPE_2B;
-                        } else if (hparams.n_embd == 4096) {
-                            type = LLM_TYPE_8B;
-                        }
-                        break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_PLAMO3:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-                if (found_swa && hparams.n_swa > 0) {
-                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
-                    uint32_t swa_period = 8;
-                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
-                    hparams.set_swa_pattern(swa_period);
-                } else {
-                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-                }
+        // call the per-model loading function
+        load_arch_tensors(ml);
 
-                switch (hparams.n_layer) {
-                    case 24: type = LLM_TYPE_2B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_GPT2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                switch (hparams.n_layer) {
-                    case 12: type = LLM_TYPE_SMALL; break;
-                    case 24: type = LLM_TYPE_MEDIUM; break;
-                    case 36: type = LLM_TYPE_LARGE; break;
-                    case 48: type = LLM_TYPE_XL; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_CODESHELL:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                switch (hparams.n_layer) {
-                    case 42: type = LLM_TYPE_7B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_ORION:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+        // generic pass: load optional per-tensor/per-expert ".scale" tensors (e.g. NVFP4 scale2)
+        // this avoids having to add scale loading to every architecture
+        for (int i = 0; i < n_layer; ++i) {
+            auto & layer = layers[i];
 
-                switch (hparams.n_layer) {
-                    case 40: type = LLM_TYPE_14B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_INTERNLM2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_7B; break;
-                    case 48: type = LLM_TYPE_20B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_GEMMA:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+            // attention weight scales (per-tensor, shape {1})
+            if (!layer.wq_s && layer.wq) {
+                layer.wq_s = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.wk_s && layer.wk) {
+                layer.wk_s = create_tensor(tn(LLM_TENSOR_ATTN_K,   "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.wv_s && layer.wv) {
+                layer.wv_s = create_tensor(tn(LLM_TENSOR_ATTN_V,   "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.wo_s && layer.wo) {
+                layer.wo_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.wqkv_s && layer.wqkv) {
+                layer.wqkv_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.wqkv_gate_s && layer.wqkv_gate) {
+                layer.wqkv_gate_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
 
-                switch (hparams.n_layer) {
-                    case 18: type = LLM_TYPE_2B; break;
-                    case 28: type = LLM_TYPE_7B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-               }
-            } break;
-        case LLM_ARCH_GEMMA2:
-            {
-                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                hparams.n_swa = 4096; // default value of gemma 2
-                uint32_t swa_period = 2;
-                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
-                hparams.set_swa_pattern(swa_period);
-                hparams.attn_soft_cap = true;
-                hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
-                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-
-                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING,      hparams.f_attn_logit_softcapping, false);
-                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,     hparams.f_final_logit_softcapping, false);
-
-                switch (hparams.n_layer) {
-                    case 26: type = LLM_TYPE_2B; break;
-                    case 42: type = LLM_TYPE_9B; break;
-                    case 46: type = LLM_TYPE_27B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-               }
-
-                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
-                hparams.f_attention_scale = type == LLM_TYPE_27B
-                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
-                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
-            } break;
-        case LLM_ARCH_GEMMA3:
-            {
-                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-                if (found_swa && hparams.n_swa > 0) {
-                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                    uint32_t swa_period = 6;
-                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
-                    hparams.set_swa_pattern(swa_period);
-
-                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
-                } else {
-                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-                }
+            // dense FFN weight scales (per-tensor, shape {1})
+            if (!layer.ffn_gate_s && layer.ffn_gate) {
+                layer.ffn_gate_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_down_s && layer.ffn_down) {
+                layer.ffn_down_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_up_s && layer.ffn_up) {
+                layer.ffn_up_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_gate_shexp_s && layer.ffn_gate_shexp) {
+                layer.ffn_gate_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_down_shexp_s && layer.ffn_down_shexp) {
+                layer.ffn_down_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_up_shexp_s && layer.ffn_up_shexp) {
+                layer.ffn_up_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
 
-                hparams.f_final_logit_softcapping = 0.0f;
-                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                switch (hparams.n_layer) {
-                    case 18: type = LLM_TYPE_270M; break;
-                    case 26: type = LLM_TYPE_1B; break;
-                    case 32: type = LLM_TYPE_8B; break; // Rnj-1
-                    case 34: type = LLM_TYPE_4B; break;
-                    case 48: type = LLM_TYPE_12B; break;
-                    case 62: type = LLM_TYPE_27B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
+            // MoE expert weight scales (per-expert, shape {n_expert})
+            if (!layer.ffn_gate_exps_s && layer.ffn_gate_exps) {
+                layer.ffn_gate_exps_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_down_exps_s && layer.ffn_down_exps) {
+                layer.ffn_down_exps_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_up_exps_s && layer.ffn_up_exps) {
+                layer.ffn_up_exps_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
+            }
 
-                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
-                hparams.f_attention_scale = type == LLM_TYPE_27B
-                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
-                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
-            } break;
-        case LLM_ARCH_GEMMA3N:
-            {
-                uint32_t swa_period = 5;
-                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
-                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                hparams.set_swa_pattern(swa_period);
-
-                hparams.n_layer_kv_from_start     = 20;
-                hparams.f_attention_scale         = 1.0f;
-
-                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                switch (hparams.n_layer) {
-                    case 30: type = LLM_TYPE_E2B; break;
-                    case 35: type = LLM_TYPE_E4B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_GEMMA4:
-            {
-                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
-
-                uint32_t n_kv_shared_layers = 0;
-                ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false);
-
-                hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t)n_kv_shared_layers;
-                hparams.f_attention_scale     = 1.0f; // Gemma4 uses self.scaling = 1.0 (no pre-attn scaling)
-
-                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp, false);
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_EMBEDDING_LENGTH_PER_LAYER,  hparams.n_embd_per_layer);
-                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA,    hparams.n_embd_head_k_swa);
-                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA,  hparams.n_embd_head_v_swa);
-                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,     hparams.f_final_logit_softcapping, false);
-
-                switch (hparams.n_layer) {
-                    case 30: type = LLM_TYPE_26B_A4B; break;
-                    case 35: type = LLM_TYPE_E2B; break;
-                    case 42: type = LLM_TYPE_E4B; break;
-                    case 60: type = LLM_TYPE_31B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_GEMMA_EMBEDDING:
-            {
-                hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
-                uint32_t swa_period = 6;
-                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
-                hparams.set_swa_pattern(swa_period);
+            // recurrent / linear-attention weight scales (per-tensor, shape {1})
+            if (!layer.ssm_in_s && layer.ssm_in) {
+                layer.ssm_in_s = create_tensor(tn(LLM_TENSOR_SSM_IN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ssm_out_s && layer.ssm_out) {
+                layer.ssm_out_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ssm_alpha_s && layer.ssm_alpha) {
+                layer.ssm_alpha_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ssm_beta_s && layer.ssm_beta) {
+                layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
 
-                hparams.causal_attn = false; // embeddings do not use causal attention
+            // input scales
+            if (!layer.wq_in_s && layer.wq) {
+                layer.wq_in_s = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.wk_in_s && layer.wk) {
+                layer.wk_in_s = create_tensor(tn(LLM_TENSOR_ATTN_K,   "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.wv_in_s && layer.wv) {
+                layer.wv_in_s = create_tensor(tn(LLM_TENSOR_ATTN_V,   "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.wo_in_s && layer.wo) {
+                layer.wo_in_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.wqkv_in_s && layer.wqkv) {
+                layer.wqkv_in_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.wqkv_gate_in_s && layer.wqkv_gate) {
+                layer.wqkv_gate_in_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_gate_in_s && layer.ffn_gate) {
+                layer.ffn_gate_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_down_in_s && layer.ffn_down) {
+                layer.ffn_down_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_up_in_s && layer.ffn_up) {
+                layer.ffn_up_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_gate_exps_in_s && layer.ffn_gate_exps) {
+                layer.ffn_gate_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_down_exps_in_s && layer.ffn_down_exps) {
+                layer.ffn_down_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_up_exps_in_s && layer.ffn_up_exps) {
+                layer.ffn_up_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_gate_shexp_in_s && layer.ffn_gate_shexp) {
+                layer.ffn_gate_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_down_shexp_in_s && layer.ffn_down_shexp) {
+                layer.ffn_down_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_up_shexp_in_s && layer.ffn_up_shexp) {
+                layer.ffn_up_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ssm_in_in_s && layer.ssm_in) {
+                layer.ssm_in_in_s = create_tensor(tn(LLM_TENSOR_SSM_IN, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ssm_out_in_s && layer.ssm_out) {
+                layer.ssm_out_in_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ssm_alpha_in_s && layer.ssm_alpha) {
+                layer.ssm_alpha_in_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ssm_beta_in_s && layer.ssm_beta) {
+                layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+        }
+    }
 
-                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.done_getting_tensors();
 
-                //applied only if model converted with --sentence-transformers-dense-modules
-                ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
-                ml.get_key(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out, false);
-                ml.get_key(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in, false);
-                ml.get_key(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out, false);
+    // populate tensors_by_name
+    for (auto & [_, ctx_ptr] : ml.ctx_map) {
+        for (auto * cur = ggml_get_first_tensor(ctx_ptr.get()); cur != NULL; cur = ggml_get_next_tensor(ctx_ptr.get(), cur)) {
+            tensors_by_name.emplace_back(ggml_get_name(cur), cur);
+        }
+    }
 
-                GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
-                GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");
+    ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
+    pimpl->mappings.reserve(ml.mappings.size());
 
-                switch (hparams.n_layer) {
-                    case 24: type = LLM_TYPE_0_3B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-                hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
+    // create the backend buffers
+    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
+    ctx_buf_maps.reserve(ml.ctx_map.size());
 
-            } break;
-        case LLM_ARCH_STARCODER2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                switch (hparams.n_layer) {
-                    case 30: type = LLM_TYPE_3B; break;
-                    case 32: type = LLM_TYPE_7B; break;
-                    case 40: type = LLM_TYPE_15B; break;
-                    case 52: type = LLM_TYPE_20B; break; // granite
-                    case 88: type = LLM_TYPE_34B; break; // granite
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_MAMBA:
-            {
-                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
-                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
-                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
-                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
-                ml.get_key(LLM_KV_SSM_DT_B_C_RMS,     hparams.ssm_dt_b_c_rms, false);
-
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                switch (hparams.n_layer) {
-                    case 24:
-                        switch (hparams.n_embd) {
-                            case 768: type = LLM_TYPE_SMALL; break;
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    case 48:
-                        switch (hparams.n_embd) {
-                            case 1024: type = LLM_TYPE_MEDIUM; break;
-                            case 1536: type = LLM_TYPE_LARGE; break;
-                            case 2048: type = LLM_TYPE_XL; break;
-                            default:   type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    case 64:
-                        switch (hparams.n_embd) {
-                            case 2560: type = LLM_TYPE_3B; break;
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_MAMBA2:
-            {
-                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
-                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
-                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
-                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
-                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
-
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                switch (hparams.n_layer) {
-                    case 24:
-                        switch (hparams.n_embd) {
-                            case 768: type = LLM_TYPE_SMALL; break;
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    case 48:
-                        switch (hparams.n_embd) {
-                            case 1024: type = LLM_TYPE_MEDIUM; break;
-                            case 1536: type = LLM_TYPE_LARGE; break;
-                            case 2048: type = LLM_TYPE_XL; break;
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    case 64:
-                        switch (hparams.n_embd) {
-                            case 2560: type = LLM_TYPE_3B; break;
-                            case 4096: type = LLM_TYPE_7B; break;
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_JAMBA:
-            {
-                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
-                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
-                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
-                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+    // Ensure we have enough capacity for the maximum backend buffer we will potentially create
+    const size_t n_max_backend_buffer = ml.ctx_map.size() * ml.files.size();
+    pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
 
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    for (auto & [buft, ctx_ptr] : ml.ctx_map) {
+        ggml_context * ctx = ctx_ptr.get();
 
-                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
-                }
+        // skip contexts without tensors
+        if (ggml_get_first_tensor(ctx) == nullptr) {
+            continue;
+        }
 
-                switch (hparams.n_layer) {
-                    // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
-                    case 12: // 900M  8x???M
-                    case 32: // 51B  16x?B
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_XVERSE:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_7B; break;
-                    case 40: type = LLM_TYPE_13B; break;
-                    case 80: type = LLM_TYPE_65B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_COMMAND_R:
-            {
-                ml.get_key(LLM_KV_LOGIT_SCALE,             hparams.f_logit_scale, false);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                switch (hparams.n_layer) {
-                    case 40: type = LLM_TYPE_35B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_COHERE2:
-            {
-                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                uint32_t swa_period = 4;
-                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
-                hparams.set_swa_pattern(swa_period);
-                hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
-                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-
-                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,       hparams.rope_freq_base_train_swa, false);
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
-                ml.get_key(LLM_KV_LOGIT_SCALE,              hparams.f_logit_scale);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_8B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_DBRX:
-        {
-            ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-            ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv);
+        llama_buf_map buf_map;
+        buf_map.reserve(n_max_backend_buffer);
 
-            switch (hparams.n_layer) {
-                case 40: type = LLM_TYPE_16x12B; break;
-                default: type = LLM_TYPE_UNKNOWN;
+        // check if it is possible to use buffer_from_host_ptr with this buffer type
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        if (!dev) {
+            // FIXME: workaround for CPU backend buft having a NULL device
+            dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (!dev) {
+                throw std::runtime_error(format("%s: no CPU backend found", __func__));
             }
-        } break;
-        case LLM_ARCH_OLMO:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv, false);
-
-                switch (hparams.n_layer) {
-                    case 22: type = LLM_TYPE_1B; break;
-                    case 32: type = LLM_TYPE_7B; break;
-                    case 80: type = LLM_TYPE_70B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_OLMO2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-                if (found_swa && hparams.n_swa > 0) {
-                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                    uint32_t swa_period = 4;
-                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
-                    hparams.set_swa_pattern(swa_period);
-
-                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
-                    hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
-                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
-                } else {
-                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-                }
+        }
+        ggml_backend_dev_props props;
+        ggml_backend_dev_get_props(dev, &props);
+        bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
+        bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
 
-                switch (hparams.n_layer) {
-                    case 16: type = LLM_TYPE_1B; break;
-                    case 32: type = LLM_TYPE_7B; break;
-                    case 40: type = LLM_TYPE_13B; break;
-                    case 64: type = LLM_TYPE_32B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_SEED_OSS:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 64: type = LLM_TYPE_36B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_OLMOE:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 16: type = LLM_TYPE_A1_7B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
+        std::vector<ggml_backend_buffer_ptr> bufs;
+        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
+            GGML_ASSERT(!ml.no_alloc);
+            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+                // only the mmap region containing the tensors in the model is mapped to the backend buffer
+                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
+                //     then we could just use metal for all layers
+                // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
+                void * addr = nullptr;
+                size_t first, last; // NOLINT
+                ml.get_mapping_range(&first, &last, &addr, idx, ctx);
+                if (first >= last) {
+                    continue;
                 }
-            } break;
-        case LLM_ARCH_OPENELM:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                switch (hparams.n_layer) {
-                case 16: type = LLM_TYPE_270M; break;
-                case 20: type = LLM_TYPE_450M; break;
-                case 28: type = LLM_TYPE_1B; break;
-                case 36: type = LLM_TYPE_3B; break;
-                default: type = LLM_TYPE_UNKNOWN;
+                const size_t max_size = ggml_get_max_tensor_size(ctx);
+                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+                if (buf == nullptr) {
+                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                 }
-            } break;
-        case LLM_ARCH_GPTNEOX:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL,   hparams.use_par_res);
-                switch (hparams.n_layer) {
-                    case 6:
-                        switch (hparams.n_ff()) {
-                            case 512:  type = LLM_TYPE_14M; break;
-                            case 2048: type = LLM_TYPE_70M; break;
-                            default:   type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    case 12:
-                        switch (hparams.n_ff()) {
-                            case 3072: type = LLM_TYPE_160M; break;
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    case 16:
-                        switch (hparams.n_ff()) {
-                            case 8192: type = LLM_TYPE_1B; break;
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    case 24:
-                        switch (hparams.n_ff()) {
-                            case 4096: type = LLM_TYPE_410M; break;
-                            case 8192: type = LLM_TYPE_1_4B; break;
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    case 32:
-                        switch (hparams.n_ff()) {
-                            case 10240: type = LLM_TYPE_2_8B; break;
-                            case 16384: type = LLM_TYPE_6_9B; break;
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    case 36:
-                        switch (hparams.n_ff()) {
-                            case 20480: type = LLM_TYPE_12B; break;
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    case 44:
-                        switch (hparams.n_ff()) {
-                            case 24576: type = LLM_TYPE_20B; break;
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    default: type = LLM_TYPE_UNKNOWN;
+                bufs.emplace_back(buf);
+                buf_map.emplace(idx, buf);
+            }
+        } else {
+            ggml_backend_buffer_t buf;
+            if (ml.no_alloc) {
+                buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
+                for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+                    t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
                 }
-            } break;
-        case LLM_ARCH_ARCTIC:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+            } else {
+                buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
+            }
+            if (buf == nullptr) {
+                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+            }
+            if (use_mlock && ggml_backend_buffer_is_host(buf)) {
+                pimpl->mlock_bufs.emplace_back(new llama_mlock);
+                auto & mlock_buf = pimpl->mlock_bufs.back();
+                mlock_buf->init   (ggml_backend_buffer_get_base(buf));
+                mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
+            }
+            bufs.emplace_back(buf);
+            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+                buf_map.emplace(idx, buf);
+            }
+        }
 
-                if (hparams.n_expert == 128) {
-                    switch (hparams.n_layer) {
-                        case 35: type = LLM_TYPE_10B_128x3_66B; break;
-                        default: type = LLM_TYPE_UNKNOWN;
-                    }
-                } else {
-                    type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_DEEPSEEK:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
-                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
-
-                switch (hparams.n_ff_exp) {
-                    case 1408: type = LLM_TYPE_16B; break;
-                    case 1792: type = LLM_TYPE_20B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_DEEPSEEK2:
-        case LLM_ARCH_MISTRAL4:
-            {
-                // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B
-                const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));
+        for (auto & buf : bufs) {
+            // indicate that this buffer contains weights
+            // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
+            ggml_backend_buffer_set_usage(buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+        }
 
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
-                if (!is_lite) {
-                    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
-                }
-                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,     hparams.n_lora_kv);
-                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,   hparams.n_embd_head_k_mla_impl, false);
-                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
-                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,        hparams.n_expert_shared);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,       hparams.expert_weights_scale, false);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,        hparams.expert_weights_norm, false);
-                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,         hparams.expert_gating_func, false);
-                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
-                    // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
-                    // that have no expert_gating_func model parameter set
-                    if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) {
-                        // GLM 4.7 Lite
-                        hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
-                    } else {
-                        hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
-                    }
-                }
+        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
 
-                if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false)) {
-                    // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
-                    // cancel the factor from the convert script
-                    hparams.rope_yarn_log_mul /= 0.1f;
-                }
+        ctx_buf_maps.emplace_back(ctx, buf_map);
+    }
 
-                // (optional) temperature tuning - used by mistral-large
-                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE,  hparams.f_attn_temp_scale,       false);
-                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false); // FIXME why not use temperature_length?
+    if (llama_supports_gpu_offload()) {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
-                hparams.f_attn_temp_offset = 0.0f;
+        int n_repeating = n_gpu;
+        if (n_repeating > 0) {
+            LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
+            n_repeating--;
+        }
+        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);
 
-                switch (hparams.n_layer) {
-                    case 27: type = LLM_TYPE_16B; break;
-                    case 47: type = LLM_TYPE_30B_A3B; break;
-                    case 60: type = LLM_TYPE_236B; break;
-                    case 61: type = LLM_TYPE_671B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_DEEPSEEK2OCR:
-            {
-                // similar to deepseek2, but without MLA
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
-                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
-                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
-
-                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
-                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
-                }
+        const int max_backend_supported_layers = hparams.n_layer + 1;
+        const int max_offloadable_layers       = hparams.n_layer + 1;
 
-                switch (hparams.n_layer) {
-                    case 12: type = LLM_TYPE_3B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_PLM:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_1_8B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_CHATGLM:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 28: {
-                        if (hparams.n_head(0) == 16) {
-                            type = LLM_TYPE_1_5B;
-                        } else {
-                            type = LLM_TYPE_6B;
-                        }
-                    } break;
-                    case 40: {
-                        if (hparams.n_head(0) == 24) {
-                            type = LLM_TYPE_4B;
-                        } else {
-                            type = LLM_TYPE_9B;
-                        }
-                    } break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_GLM4:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,    hparams.f_norm_rms_eps);
-                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
-
-                // NextN/MTP parameters (GLM-OCR)
-                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
-                GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
-
-                // TODO: when MTP is implemented, this should probably be updated if needed
-                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
-
-                switch (hparams.n_layer) {
-                    case 17: type = LLM_TYPE_1B; break; // GLM-OCR
-                    case 40: type = LLM_TYPE_9B; break;
-                    case 61: type = LLM_TYPE_32B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_GLM4_MOE:
-            {
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,     hparams.n_ff_exp);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,    hparams.f_norm_rms_eps);
-                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
-
-                // MoE parameters
-                ml.get_key(LLM_KV_EXPERT_COUNT,                hparams.n_expert);
-                ml.get_key(LLM_KV_EXPERT_USED_COUNT,           hparams.n_expert_used);
-                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
-                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
-
-                // Expert gating function (GLM-4.5 uses sigmoid)
-                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
-                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
-                    hparams.expert_gating_func =  LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
-                }
-
-                // NextN/MTP parameters
-                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
-                GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
-
-                // TODO: when MTP is implemented, this should probably be updated if needed
-                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+    }
 
-                switch (hparams.n_layer) {
-                    case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
-                    case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
-                    case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_GLM_DSA:
-            {
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,     hparams.n_ff_exp);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,    hparams.f_norm_rms_eps);
-                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
-
-                // MoE parameters
-                ml.get_key(LLM_KV_EXPERT_COUNT,                hparams.n_expert);
-                ml.get_key(LLM_KV_EXPERT_USED_COUNT,           hparams.n_expert_used);
-                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
-                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
-
-                // deepseek MLA parameters
-                ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,      hparams.n_lora_q);
-                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,     hparams.n_lora_kv);
-                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,   hparams.n_embd_head_k_mla_impl, false);
-                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
-                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,        hparams.n_expert_shared);
-
-                // DSA parameters
-                ml.get_key(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head);
-                ml.get_key(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size);
-                ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K,      hparams.indexer_top_k);
-
-                // Expert gating function (GLM-4.5 uses sigmoid)
-                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
-                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
-                    hparams.expert_gating_func =  LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
-                }
+    // print memory requirements per buffer type
+    for (auto & [_, bufs] : pimpl->ctxs_bufs) {
+        for (auto & buf: bufs) {
+            LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
+                __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+        }
+    }
 
-                // NextN/MTP parameters
-                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
-                GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+    if (ml.no_alloc) {
+        return true;
+    }
 
-                // TODO: when MTP is implemented, this should probably be updated if needed
-                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+    // load tensor data
+    for (auto & [ctx, buf_map] : ctx_buf_maps) {
+        if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+            return false;
+        }
+    }
 
-                switch (hparams.n_layer) {
-                    case 79: type = LLM_TYPE_744B_A40B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_BITNET:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    if (use_mmap_buffer) {
+        for (auto & mapping : ml.mappings) {
+            pimpl->mappings.emplace_back(std::move(mapping));
+        }
+    }
 
-                switch (hparams.n_layer) {
-                    case 26: type = LLM_TYPE_3B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_T5:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+    return true;
+}
 
-                uint32_t dec_start_token_id;
-                if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
-                    hparams.dec_start_token_id = dec_start_token_id;
-                }
+ggml_tensor * llama_model_base::create_tensor(llama_model_loader & ml, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) {
+    const buft_list_t * buft_list_layer = tn.bid == -1 ? nullptr : pimpl->dev_layer.at(tn.bid).buft_list;
+    return ml.create_tensor(
+        hparams, &pimpl->cpu_buft_list, pimpl->dev_input.buft_list, pimpl->dev_output.buft_list, buft_list_layer,
+        tn, ne, flags);
+}
 
-                hparams.dec_n_layer = hparams.n_layer;
-                ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
-
-                switch (hparams.n_layer) {
-                    case 6:  type = LLM_TYPE_60M;  break; // t5-small
-                    case 8:  type = LLM_TYPE_80M;  break; // flan-t5-small
-                    case 12:
-                        switch (hparams.n_ff()) {
-                            case 3072: type = LLM_TYPE_220M; break; // t5-base
-                            case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    case 24:
-                        switch (hparams.n_ff()) {
-                            case 4096:  type = LLM_TYPE_770M; break; // t5-large
-                            case 2816:  type = LLM_TYPE_780M; break; // flan-t5-large
-                            case 16384: type = LLM_TYPE_3B;   break; // t5-3b
-                            case 5120:  type = LLM_TYPE_3B;   break; // flan-t5-xl
-                            case 65536: type = LLM_TYPE_11B;  break; // t5-11b
-                            case 10240: type = LLM_TYPE_11B;  break; // flan-t5-xxl
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    default: type = LLM_TYPE_UNKNOWN;
-               }
-            } break;
-        case LLM_ARCH_T5ENCODER:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
-                type = LLM_TYPE_UNKNOWN;
-            } break;
-        case LLM_ARCH_JAIS:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
-
-                switch (hparams.n_layer) {
-                    case 24: type = LLM_TYPE_1_3B; break;
-                    case 40: type = LLM_TYPE_13B; break;
-                    /* TODO: add variants */
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_JAIS2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+std::string llama_model::arch_name() const {
+    return llm_arch_name(arch);
+}
 
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_8B; break;
-                    case 68: type = LLM_TYPE_70B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_NEMOTRON:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_4B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_NEMOTRON_H:
-        case LLM_ARCH_NEMOTRON_H_MOE:
-            {
-                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
-                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
-                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
-                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
-                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
-
-                // A layer is recurrent IFF the n_head_kv value is set to 0 and
-                // the n_ff value is set to 0
-                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-                    hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
-                }
+std::string llama_model::type_name() const {
+    return llm_type_name(type);
+}
 
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+std::string llama_model::desc() const {
+    return pimpl->desc_str;
+}
 
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp,        false);
-                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp,      false);
-                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared, false);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
-                ml.get_key(LLM_KV_MOE_LATENT_SIZE,                   hparams.moe_latent_size, false);
+size_t llama_model::size() const {
+    return pimpl->n_bytes;
+}
 
-                switch (hparams.n_layer) {
-                    case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
-                    case 56: type = LLM_TYPE_9B; break;
-                    case 88: type = LLM_TYPE_120B_A12B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_EXAONE:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+size_t llama_model::n_tensors() const {
+    return tensors_by_name.size();
+}
 
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_8B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_EXAONE4:
-            {
-                if (hparams.n_layer == 64) {    // 32B
-                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                    hparams.n_swa = 4096;
-                    uint32_t swa_period = 4;
-                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
-                    hparams.set_swa_pattern(swa_period);
-
-                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
-                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
-                }
+size_t llama_model::n_devices() const {
+    return devices.size();
+}
 
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+const float * llama_model::tensor_split() const {
+    return params.tensor_split;
+}
 
-                switch (hparams.n_layer) {
-                    case 30: type = LLM_TYPE_1_2B; break;
-                    case 64: type = LLM_TYPE_32B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_EXAONE_MOE:
-            {
-                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                hparams.n_swa = 128;
-                uint32_t swa_period = 4;
-                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
-                hparams.set_swa_pattern(swa_period);
-                hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
-                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-
-                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,                hparams.rope_freq_base_train_swa, false);
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,          hparams.n_swa);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared, false);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
-                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
-                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
-                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead, false);
-
-                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.nextn_predict_layers, false);
-                GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
-
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_30B_A3B; break;
-                    case 48:
-                    case 49: type = LLM_TYPE_235B_A22B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_RWKV6:
-        case LLM_ARCH_RWKV6QWEN2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,     hparams.f_norm_eps, false);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
-                ml.get_key(LLM_KV_WKV_HEAD_SIZE,               hparams.wkv_head_size);
-                ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM,          hparams.time_mix_extra_dim);
-                ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM,        hparams.time_decay_extra_dim);
-                ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS,      hparams.rescale_every_n_layers, false);
-                ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,           hparams.token_shift_count, false);
-
-                switch (hparams.n_layer) {
-                    case 24: type = LLM_TYPE_1_6B; break;
-                    case 32:
-                        switch (hparams.n_embd) {
-                            case 2560: type = LLM_TYPE_3B; break;
-                            case 4096: type = LLM_TYPE_7B; break;
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    case 61: type = LLM_TYPE_14B; break;
-                    case 64: type = LLM_TYPE_32B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_RWKV7:
-        case LLM_ARCH_ARWKV7:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,                hparams.f_norm_eps, false);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,            hparams.f_norm_rms_eps, false);
-                ml.get_key(LLM_KV_WKV_HEAD_SIZE,                          hparams.wkv_head_size);
-                ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK,              hparams.n_lora_decay);
-                ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK,               hparams.n_lora_iclr);
-                ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
-                ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK,               hparams.n_lora_gate, false);
-                ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,                      hparams.token_shift_count, false);
-
-                switch (hparams.n_layer) {
-                    case 12:
-                        switch (hparams.n_embd) {
-                            case 768: type = LLM_TYPE_190M; break;
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    case 24:
-                        switch (hparams.n_embd) {
-                            case 1024: type = LLM_TYPE_450M; break;
-                            case 2048: type = LLM_TYPE_1_5B; break;
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    case 28:
-                        switch (hparams.n_embd) {
-                            case 1536: type = LLM_TYPE_1_5B; break;
-                            case 3584: type = LLM_TYPE_7B; break;
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    case 32:
-                        switch (hparams.n_embd) {
-                            case 2560: type = LLM_TYPE_2_9B; break;
-                            case 4096: type = LLM_TYPE_7B; break;
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    case 61:
-                        switch (hparams.n_embd) {
-                            case 4096: type = LLM_TYPE_14B; break;
-                            default: type = LLM_TYPE_UNKNOWN;
-                        } break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_GRANITE:
-        case LLM_ARCH_GRANITE_MOE:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale);
-                ml.get_key(LLM_KV_RESIDUAL_SCALE,              hparams.f_residual_scale, false);
-                ml.get_key(LLM_KV_EMBEDDING_SCALE,             hparams.f_embedding_scale, false);
-                ml.get_key(LLM_KV_ATTENTION_SCALE,             hparams.f_attention_scale, false);
-
-                // Granite uses rope_finetuned as a switch for rope, so default to true
-                bool rope_finetuned = true;
-                ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
-                hparams.rope_finetuned = rope_finetuned;
-
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_3B; break;
-                    case 40: type = LLM_TYPE_3B; break;
-                    // Add additional layer/vocab/etc checks here for other model sizes
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
+uint32_t llama_model::n_gpu_layers() const {
+    return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
+}
 
-                // For Granite MoE Shared
-                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
-            } break;
-        case LLM_ARCH_GRANITE_HYBRID:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale, /* required */ false);
-                ml.get_key(LLM_KV_RESIDUAL_SCALE,              hparams.f_residual_scale, /* required */ false);
-                ml.get_key(LLM_KV_EMBEDDING_SCALE,             hparams.f_embedding_scale, /* required */ false);
-                ml.get_key(LLM_KV_ATTENTION_SCALE,             hparams.f_attention_scale, /* required */ false);
-
-                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
-                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
-                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
-                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
-                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
-
-                // Granite uses rope_finetuned as a switch for rope, so default to true
-                bool rope_finetuned = true;
-                ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
-                hparams.rope_finetuned = rope_finetuned;
-
-                // A layer is recurrent IFF the n_head_kv value is set to 0
-                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
-                }
+llama_split_mode llama_model::split_mode() const {
+    return params.split_mode;
+}
 
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
+        if (hparams.no_alloc) {
+            GGML_ASSERT(bufs.size() == 1);
+            ggml_backend_buffer_t buf = bufs[0].get();
+            GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
+            ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
+            ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
+        } else {
+            for (const auto & buf : bufs) {
+                // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
+                ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+            }
+        }
+    }
+    return ret;
+}
 
-                switch (hparams.n_embd) {
-                    case 768: type = LLM_TYPE_350M; break;
-                    case 1536: type = (hparams.n_ff() == 512 ? LLM_TYPE_7B_A1B : LLM_TYPE_1B); break;
-                    case 2048: case 2560: type = LLM_TYPE_3B; break;
-                    case 4096: type = LLM_TYPE_32B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
+uint64_t llama_model::n_elements() const {
+    return pimpl->n_elements;
+}
 
-                // For Granite MoE Shared
-                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
-            } break;
-        case LLM_ARCH_CHAMELEON:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                hparams.f_norm_eps = 1e-5;  // eps for qk-norm, torch default
-                ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm, false);
-
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_7B; break;
-                    case 48: type = LLM_TYPE_34B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-               }
-            } break;
-        case LLM_ARCH_WAVTOKENIZER_DEC:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
-                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS,    hparams.f_norm_group_eps);
-                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
-            } break;
-        case LLM_ARCH_BAILINGMOE:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
-                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
-
-                switch (hparams.n_layer) {
-                    case 28: type = LLM_TYPE_16B; break;
-                    case 88: type = LLM_TYPE_290B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_BAILINGMOE2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead, false);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
-                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
-                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
-                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
-                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.nextn_predict_layers, false);
-                GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
-
-                // TODO: when MTP is implemented, this should probably be updated if needed
-                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
-
-                switch (hparams.n_layer) {
-                    case 20: type = LLM_TYPE_16B_A1B; break;
-                    case 21: type = LLM_TYPE_16B_A1B; break;
-                    case 32: type = LLM_TYPE_100B_A6B; break;
-                    case 33: type = LLM_TYPE_100B_A6B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_DOTS1:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
-                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
-                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
-                switch (hparams.n_layer) {
-                    case 62: type = LLM_TYPE_142B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_ERNIE4_5:
-        case LLM_ARCH_ERNIE4_5_MOE:
-        case LLM_ARCH_PADDLEOCR:
-            {
-                // paddleocr need mrope_section
-                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
-
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                if (arch == LLM_ARCH_ERNIE4_5_MOE) {
-                    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
-                    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
-                    ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,         hparams.n_moe_layer_step);
-                    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead, false);
-                }
+void llama_model::print_info() const {
+    const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
 
-                switch (hparams.n_layer) {
-                    case 18: type = LLM_TYPE_0_3B; break;
-                    case 28: type = LLM_TYPE_21B_A3B; break;
-                    case 54: type = LLM_TYPE_300B_A47B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_FALCON_H1:
-            {
-                // Common parameters
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                // SSM parameters
-                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
-                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
-                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
-                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
-                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
-
-                std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
-
-                switch (hparams.n_layer) {
-                    case 36:
-                        type = LLM_TYPE_0_5B; break;
-                    case 24:
-                        type = LLM_TYPE_1_5B; break;
-                    case 66:
-                        type = LLM_TYPE_1B; break;
-                    case 32:
-                        type = LLM_TYPE_3B; break;
-                    case 44:
-                        type = LLM_TYPE_7B; break;
-                    case 72:
-                        type = LLM_TYPE_34B; break;
-                    default:
-                        type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_HUNYUAN_MOE:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
-                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+    auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
+        bool is_var = false;
 
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_A13B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_HUNYUAN_VL:
-        case LLM_ARCH_HUNYUAN_DENSE:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
-
-                // XDRoPE / NTK-aware scaling: base = rope_theta * alpha^(dim / (dim - 2))
-                if (hparams.rope_scaling_alpha > 0.0f) {
-                    const int dim = hparams.n_embd_head_k();
-                    hparams.rope_freq_base_train = hparams.rope_freq_base_train
-                        * powf(hparams.rope_scaling_alpha, (float)dim / (float)(dim - 2));
-                }
+        std::vector<uint32_t> v;
+        for (uint32_t i = 0; i < n; ++i) {
+            v.push_back(f(i));
+            if (v[i] != v[0]) {
+                is_var = true;
+            }
+        }
 
-                switch (hparams.n_embd) {
-                    case 1024: type = LLM_TYPE_0_5B; break;
-                    case 2048: type = LLM_TYPE_1_8B; break;
-                    case 3072: type = LLM_TYPE_4B; break;
-                    case 4096: type = LLM_TYPE_7B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_SMOLLM3:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                hparams.n_no_rope_layer_step = 4;
+        std::stringstream ss;
 
-                switch (hparams.n_layer) {
-                    case 36: type = LLM_TYPE_3B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_OPENAI_MOE:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
-
-                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                uint32_t swa_period = 2;
-                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
-                hparams.set_swa_pattern(swa_period);
-
-                hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
-                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
-
-                switch (hparams.n_layer) {
-                    case 24: type = LLM_TYPE_20B; break;
-                    case 36: type = LLM_TYPE_120B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_LFM2:
-            {
-                ml.get_key(LLM_KV_SHORTCONV_L_CACHE,           hparams.n_shortconv_l_cache);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                for (uint32_t il = 0; il < hparams.n_layer; ++il) {
-                    hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
-                }
-                hparams.n_layer_dense_lead = hparams.n_layer;
-                switch (hparams.n_ff()) {
-                    case  4608: type = LLM_TYPE_350M; break;
-                    case  6912: type = LLM_TYPE_700M; break;
-                    case  8192: type = LLM_TYPE_1_2B; break;
-                    case 10752: type = LLM_TYPE_2_6B; break;
-                    default:    type = LLM_TYPE_UNKNOWN;
-                }
-                if (const auto is_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); is_swa && hparams.n_swa > 0) {
-                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                    for (uint32_t il = 0; il < hparams.n_layer; ++il) {
-                        hparams.swa_layers[il] = !hparams.recurrent_layer_arr[il];
-                    }
-                }
-            } break;
-        case LLM_ARCH_LFM2MOE:
-            {
-                ml.get_key(LLM_KV_SHORTCONV_L_CACHE,           hparams.n_shortconv_l_cache);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
-                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func);
-
-                for (uint32_t il = 0; il < hparams.n_layer; ++il) {
-                    hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
+        if (is_var) {
+            ss << "[";
+            for (uint32_t i = 0; i < n; ++i) {
+                ss << v[i];
+                if (i < n - 1) {
+                    ss << ", ";
                 }
+            }
+            ss << "]";
+        } else {
+            ss << v[0];
+        }
 
-                switch (hparams.n_layer) {
-                    case 24: type = LLM_TYPE_8B_A1B;  break;
-                    case 40: type = LLM_TYPE_24B_A2B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_SMALLTHINKER:
-            {
-                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-
-                if (found_swa && hparams.n_swa > 0) {
-                    hparams.swa_type    = LLAMA_SWA_TYPE_STANDARD;
-                    hparams.n_swa       = 4096;
-                    uint32_t swa_period = 4;
-                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
-                    hparams.set_swa_pattern(swa_period, true);
-
-                    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
-                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
-                } else {
-                    hparams.swa_type             = LLAMA_SWA_TYPE_NONE;
-                    hparams.n_no_rope_layer_step = hparams.n_layer;
-                }
+        return ss.str();
+    };
 
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp, false);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
+    // hparams
+    LLAMA_LOG_INFO("%s: arch                  = %s\n",     __func__, arch_name().c_str());
+    LLAMA_LOG_INFO("%s: vocab_only            = %d\n",     __func__, hparams.vocab_only);
+    LLAMA_LOG_INFO("%s: no_alloc              = %d\n",     __func__, hparams.no_alloc);
 
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_4B;  break;
-                    case 52: type = LLM_TYPE_20B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_GROVEMOE:
-            {
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
-                ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,  hparams.n_ff_chexp, false);
-                ml.get_key(LLM_KV_EXPERT_GROUP_SCALE,                hparams.expert_group_scale);
-                ml.get_key(LLM_KV_EXPERTS_PER_GROUP,                 hparams.n_group_experts);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
-
-                switch (hparams.n_layer) {
-                    case 48: type = LLM_TYPE_30B_A3B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_APERTUS:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N,        hparams.xielu_alpha_n, hparams.n_layer);
-                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P,        hparams.xielu_alpha_p, hparams.n_layer);
-                ml.get_key_or_arr(LLM_KV_XIELU_BETA,           hparams.xielu_beta,    hparams.n_layer);
-                ml.get_key_or_arr(LLM_KV_XIELU_EPS,            hparams.xielu_eps,     hparams.n_layer);
-
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_8B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_MINIMAX_M2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,  hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,   hparams.n_ff_exp);
-                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,           hparams.expert_gating_func, false);
+    if (!hparams.vocab_only) {
+        LLAMA_LOG_INFO("%s: n_ctx_train           = %u\n",     __func__, hparams.n_ctx_train);
+        LLAMA_LOG_INFO("%s: n_embd                = %u\n",     __func__, hparams.n_embd);
+        LLAMA_LOG_INFO("%s: n_embd_inp            = %u\n",     __func__, hparams.n_embd_inp());
+        LLAMA_LOG_INFO("%s: n_layer               = %u\n",     __func__, hparams.n_layer);
+        LLAMA_LOG_INFO("%s: n_head                = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head(il);    }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_head_kv             = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_rot                 = %u\n",     __func__, hparams.n_rot_full);
+        LLAMA_LOG_INFO("%s: n_swa                 = %u\n",     __func__, hparams.n_swa);
+        LLAMA_LOG_INFO("%s: is_swa_any            = %u\n",     __func__, hparams.is_swa_any());
+        LLAMA_LOG_INFO("%s: n_embd_head_k         = %u\n",     __func__, hparams.n_embd_head_k_full);
+        LLAMA_LOG_INFO("%s: n_embd_head_v         = %u\n",     __func__, hparams.n_embd_head_v_full);
+        LLAMA_LOG_INFO("%s: n_gqa                 = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il);        }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_embd_k_gqa          = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_embd_v_gqa          = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: f_norm_eps            = %.1e\n",   __func__, hparams.f_norm_eps);
+        LLAMA_LOG_INFO("%s: f_norm_rms_eps        = %.1e\n",   __func__, hparams.f_norm_rms_eps);
+        LLAMA_LOG_INFO("%s: f_clamp_kqv           = %.1e\n",   __func__, hparams.f_clamp_kqv);
+        LLAMA_LOG_INFO("%s: f_max_alibi_bias      = %.1e\n",   __func__, hparams.f_max_alibi_bias);
+        LLAMA_LOG_INFO("%s: f_logit_scale         = %.1e\n",   __func__, hparams.f_logit_scale);
+        LLAMA_LOG_INFO("%s: f_attn_scale          = %.1e\n",   __func__, hparams.f_attention_scale);
+        LLAMA_LOG_INFO("%s: n_ff                  = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_expert              = %u\n",     __func__, hparams.n_expert);
+        LLAMA_LOG_INFO("%s: n_expert_used         = %u\n",     __func__, hparams.n_expert_used);
+        LLAMA_LOG_INFO("%s: n_expert_groups       = %d\n",     __func__, hparams.n_expert_groups);
+        LLAMA_LOG_INFO("%s: n_group_used          = %d\n",     __func__, hparams.n_group_used);
+        LLAMA_LOG_INFO("%s: causal attn           = %d\n",     __func__, hparams.causal_attn);
+        LLAMA_LOG_INFO("%s: pooling type          = %d\n",     __func__, hparams.pooling_type);
+        LLAMA_LOG_INFO("%s: rope type             = %d\n",     __func__, hparams.rope_type);
+        LLAMA_LOG_INFO("%s: rope scaling          = %s\n",     __func__, rope_scaling_type.c_str());
+        LLAMA_LOG_INFO("%s: freq_base_train       = %.1f\n",   __func__, hparams.rope_freq_base_train);
+        LLAMA_LOG_INFO("%s: freq_scale_train      = %g\n",     __func__, hparams.rope_freq_scale_train);
+        if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+            LLAMA_LOG_INFO("%s: freq_base_swa         = %.1f\n",   __func__, hparams.rope_freq_base_train_swa);
+            LLAMA_LOG_INFO("%s: freq_scale_swa        = %g\n",     __func__, hparams.rope_freq_scale_train_swa);
+            LLAMA_LOG_INFO("%s: n_embd_head_k_swa     = %u\n",     __func__, hparams.n_embd_head_k_swa);
+            LLAMA_LOG_INFO("%s: n_embd_head_v_swa     = %u\n",     __func__, hparams.n_embd_head_v_swa);
+            LLAMA_LOG_INFO("%s: n_rot_swa             = %u\n",     __func__, hparams.n_rot_swa);
+        }
+        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn       = %u\n",     __func__, hparams.n_ctx_orig_yarn);
+        LLAMA_LOG_INFO("%s: rope_yarn_log_mul     = %.4f\n",   __func__, hparams.rope_yarn_log_mul);
+        LLAMA_LOG_INFO("%s: rope_finetuned        = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
+        // MRoPE (Multi-axis Rotary Position Embedding) sections
+        if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
+            LLAMA_LOG_INFO("%s: mrope sections        = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
+        }
+        if (!classifier_labels.empty()) {
+            LLAMA_LOG_INFO("%s: n_cls_out             = %u\n", __func__, hparams.n_cls_out);
 
-                switch (hparams.n_layer) {
-                    case 62: type = LLM_TYPE_230B_A10B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_COGVLM:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 32: type = LLM_TYPE_13B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_PANGU_EMBED:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                switch (hparams.n_layer) {
-                    case 26: type = LLM_TYPE_1B; break; // openPangu-Embedded-1B-V1.1
-                    case 34: type = LLM_TYPE_7B; break; // openPangu-Embedded-7B-V1.1
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_QWEN3NEXT:
-            {
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
-                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
-
-                // Load linear attention (gated delta net) parameters
-                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
-                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
-                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
-                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
-                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
-
-                // Mark recurrent layers (linear attention layers)
-                {
-                    uint32_t full_attn_interval = 4;
-                    ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
-                    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-                        hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
-                    }
-                }
+            size_t i = 0;
+            for (const auto & label : classifier_labels) {
+                LLAMA_LOG_INFO("%s: cls_label[%2zu]         = %s\n", __func__, i++, label.c_str());
+            }
+        }
 
-                switch (hparams.n_layer) {
-                    case 48: type = LLM_TYPE_80B_A3B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_QWEN35:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
-                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS,    hparams.rope_sections, 4, true);
-
-                // Load linear attention (gated delta net) parameters
-                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
-                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
-                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
-                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
-                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
-
-                // Mark recurrent layers (linear attention layers)
-                {
-                    uint32_t full_attn_interval = 4;
-                    ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
-                    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-                        hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
-                    }
-                }
+        if (arch == LLM_ARCH_MAMBA ||
+                arch == LLM_ARCH_MAMBA2 ||
+                arch == LLM_ARCH_JAMBA ||
+                arch == LLM_ARCH_FALCON_H1 ||
+                arch == LLM_ARCH_PLAMO2 ||
+                arch == LLM_ARCH_GRANITE_HYBRID ||
+                arch == LLM_ARCH_QWEN3NEXT ||
+                arch == LLM_ARCH_QWEN35 ||
+                arch == LLM_ARCH_QWEN35MOE ||
+                arch == LLM_ARCH_NEMOTRON_H ||
+                arch == LLM_ARCH_NEMOTRON_H_MOE) {
+            LLAMA_LOG_INFO("%s: ssm_d_conv            = %u\n",     __func__, hparams.ssm_d_conv);
+            LLAMA_LOG_INFO("%s: ssm_d_inner           = %u\n",     __func__, hparams.ssm_d_inner);
+            LLAMA_LOG_INFO("%s: ssm_d_state           = %u\n",     __func__, hparams.ssm_d_state);
+            LLAMA_LOG_INFO("%s: ssm_dt_rank           = %u\n",     __func__, hparams.ssm_dt_rank);
+            LLAMA_LOG_INFO("%s: ssm_n_group           = %u\n",     __func__, hparams.ssm_n_group);
+            LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms        = %d\n",     __func__, hparams.ssm_dt_b_c_rms);
+        }
 
-                switch (hparams.n_layer) {
-                    case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_8B : LLM_TYPE_2B; break;
-                    case 32: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_9B; break;
-                    case 64: type = LLM_TYPE_27B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_QWEN35MOE:
-            {
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
-                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
-
-                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS,    hparams.rope_sections, 4, true);
-
-                // Load linear attention (gated delta net) parameters
-                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
-                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
-                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
-                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
-                ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
-
-                // Mark recurrent layers (linear attention layers)
-                {
-                    uint32_t full_attn_interval = 4;
-                    ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
-                    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-                        hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
-                    }
-                }
+        LLAMA_LOG_INFO("%s: model type            = %s\n",     __func__, type_name().c_str());
+        if (pimpl->n_elements >= 1e12) {
+            LLAMA_LOG_INFO("%s: model params          = %.2f T\n", __func__, pimpl->n_elements*1e-12);
+        } else if (pimpl->n_elements >= 1e9) {
+            LLAMA_LOG_INFO("%s: model params          = %.2f B\n", __func__, pimpl->n_elements*1e-9);
+        } else if (pimpl->n_elements >= 1e6) {
+            LLAMA_LOG_INFO("%s: model params          = %.2f M\n", __func__, pimpl->n_elements*1e-6);
+        } else {
+            LLAMA_LOG_INFO("%s: model params          = %.2f K\n", __func__, pimpl->n_elements*1e-3);
+        }
 
-                switch (hparams.n_layer) {
-                    case 40: type = LLM_TYPE_35B_A3B; break;
-                    case 48: type = LLM_TYPE_122B_A10B; break;
-                    case 60: type = LLM_TYPE_397B_A17B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_MISTRAL3:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+        // general kv
+        LLAMA_LOG_INFO("%s: general.name          = %s\n",    __func__, name.c_str());
 
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast,    false);
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow,    false);
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,   hparams.rope_yarn_log_mul, false);
+        if (arch == LLM_ARCH_DEEPSEEK) {
+            LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
+            LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
+            LLAMA_LOG_INFO("%s: n_expert_shared       = %d\n",     __func__, hparams.n_expert_shared);
+            LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
+        }
 
-                hparams.f_attn_temp_offset = 0.0f;
+        if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
+            LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
+            LLAMA_LOG_INFO("%s: n_lora_q              = %d\n",     __func__, hparams.n_lora_q);
+            LLAMA_LOG_INFO("%s: n_lora_kv             = %d\n",     __func__, hparams.n_lora_kv);
+            LLAMA_LOG_INFO("%s: n_embd_head_k_mla     = %d\n",     __func__, hparams.n_embd_head_k_mla());
+            LLAMA_LOG_INFO("%s: n_embd_head_v_mla     = %d\n",     __func__, hparams.n_embd_head_v_mla());
+            LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
+            LLAMA_LOG_INFO("%s: n_expert_shared       = %d\n",     __func__, hparams.n_expert_shared);
+            LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
+            LLAMA_LOG_INFO("%s: expert_weights_norm   = %d\n",     __func__, hparams.expert_weights_norm);
+            LLAMA_LOG_INFO("%s: expert_gating_func    = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+        }
 
-                // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
-                if (hparams.f_attn_temp_scale != 0.0f) {
-                    hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
-                    if (hparams.n_attn_temp_floor_scale == 0) {
-                        throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
-                    }
-                }
+        if (arch == LLM_ARCH_QWEN2MOE) {
+            LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
+            LLAMA_LOG_INFO("%s: n_ff_shexp            = %d\n",     __func__, hparams.n_ff_shexp);
+        }
 
-                switch (hparams.n_layer) {
-                    case 26: type = LLM_TYPE_3B; break;
-                    case 34: type = LLM_TYPE_8B; break;
-                    case 40: type = LLM_TYPE_14B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_MIMO2:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+        if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
+            LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
+        }
 
-                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+        if (arch == LLM_ARCH_MINICPM ||
+                arch == LLM_ARCH_GRANITE ||
+                arch == LLM_ARCH_GRANITE_MOE ||
+                arch == LLM_ARCH_GRANITE_HYBRID ||
+                arch == LLM_ARCH_NEMOTRON_H_MOE) {
+            LLAMA_LOG_INFO("%s: f_embedding_scale     = %f\n", __func__, hparams.f_embedding_scale);
+            LLAMA_LOG_INFO("%s: f_residual_scale      = %f\n", __func__, hparams.f_residual_scale);
+            LLAMA_LOG_INFO("%s: f_attention_scale     = %f\n", __func__, hparams.f_attention_scale);
+            LLAMA_LOG_INFO("%s: n_ff_shexp            = %d\n", __func__, hparams.n_ff_shexp);
+        }
 
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,   hparams.n_swa);
-                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,         hparams.rope_freq_base_train_swa, false);
-                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
+        if (arch == LLM_ARCH_BAILINGMOE) {
+            LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
+            LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
+            LLAMA_LOG_INFO("%s: n_expert_shared       = %d\n",     __func__, hparams.n_expert_shared);
+            LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
+            LLAMA_LOG_INFO("%s: expert_weights_norm   = %d\n",     __func__, hparams.expert_weights_norm);
+        }
 
-                switch (hparams.n_layer) {
-                    case 48: type = LLM_TYPE_310B_A15B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_KIMI_LINEAR:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,    hparams.n_embd_head_k_mla_impl);
-                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA,  hparams.n_embd_head_v_mla_impl);
-                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,      hparams.n_lora_kv);
-                ml.get_key(LLM_KV_SSM_CONV_KERNEL,             hparams.ssm_d_conv);
-                ml.get_key(LLM_KV_KDA_HEAD_DIM,                hparams.n_embd_head_kda);
-
-                // MLA qk_rope_head_dim (for reference)
-                // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192
-
-                // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba)
-                // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention)
-                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;  // KDA layers are recurrent
-                }
+        if (arch == LLM_ARCH_BAILINGMOE2) {
+            LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
+            LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
+            LLAMA_LOG_INFO("%s: n_ff_shexp            = %d\n",     __func__, hparams.n_ff_shexp);
+            LLAMA_LOG_INFO("%s: n_expert_shared       = %d\n",     __func__, hparams.n_expert_shared);
+            LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
+            LLAMA_LOG_INFO("%s: expert_weights_norm   = %d\n",     __func__, hparams.expert_weights_norm);
+            LLAMA_LOG_INFO("%s: expert_gating_func    = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+            LLAMA_LOG_INFO("%s: nextn_predict_layers  = %d\n",     __func__, hparams.nextn_predict_layers);
+        }
 
-                // MoE parameters - Kimi uses moe_intermediate_size = 1024
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
-                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared);
-                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead, false);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
-                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
+        if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
+            LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
+            LLAMA_LOG_INFO("%s: expert_gating_func    = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+        }
 
-                switch (hparams.n_layer) {
-                    case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        case LLM_ARCH_STEP35:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+        if (arch == LLM_ARCH_GROVEMOE) {
+            LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
+            LLAMA_LOG_INFO("%s: n_ff_chexp            = %d\n",     __func__, hparams.n_ff_chexp);
+            LLAMA_LOG_INFO("%s: n_group_experts       = %d\n",     __func__, hparams.n_group_experts);
+            LLAMA_LOG_INFO("%s: expert_group_scale    = %.2f\n",   __func__, hparams.expert_group_scale);
+        }
+    }
 
-                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+    vocab.print_info();
+}
 
-                // full_attention layer only use half of the RoPE dimensions
-                hparams.n_rot_full = hparams.n_rot_full / 2;
+ggml_backend_dev_t llama_model::dev_layer(int il) const {
+    return pimpl->dev_layer.at(il).dev;
+}
 
-                // MoE + SWA parameters
-                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
-                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
-                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func, false);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
-                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
+ggml_backend_dev_t llama_model::dev_output() const {
+    return pimpl->dev_output.dev;
+}
 
-                // Step35 uses sigmoid gating by default (if not set in GGUF)
-                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
-                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
-                }
+template<typename F>
+static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
+    ggml_init_params params = {
+        /*.mem_size   =*/ ggml_tensor_overhead()*8,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
 
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,  hparams.n_swa);
-                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,        hparams.rope_freq_base_train_swa, false);
-                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
-                ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP,   hparams.swiglu_clamp_exp,   hparams.n_layer, false);
-                ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false);
+    ggml_context_ptr ctx { ggml_init(params) };
+    if (!ctx) {
+        throw std::runtime_error(format("failed to create ggml context"));
+    }
 
-                switch (hparams.n_layer) {
-                    case 45: type = LLM_TYPE_196B_A11B; break;
-                    default: type = LLM_TYPE_UNKNOWN;
-                }
-            } break;
-        default: throw std::runtime_error("unsupported model architecture: " + arch_name());
+    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
+    ggml_tensor * op_tensor = fn(ctx.get());
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (op_tensor->src[i] != nullptr) {
+            assert(op_tensor->src[i]->buffer == nullptr);
+            op_tensor->src[i]->buffer = buf.get();
+        }
     }
 
-    pimpl->n_bytes = ml.n_bytes;
+    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
 
-    pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
+    return op_supported;
+}
 
-    if (hparams.f_max_alibi_bias > 0.0f) {
-        hparams.use_alibi = true;
+template<typename F>
+static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
+    for (const auto & cur : buft_list) {
+        ggml_backend_dev_t cur_dev = cur.first;
+        ggml_backend_buffer_type_t cur_buft = cur.second;
+        if (buft_supported(cur_buft, cur_dev, fn)) {
+            return cur_buft;
+        }
     }
 
-    hparams.rope_type = llama_model_rope_type(this);
+    throw std::runtime_error(format("no suitable buffer type found"));
 }
 
-void llama_model::load_vocab(llama_model_loader & ml) {
-    const auto kv = LLM_KV(arch);
-
-    vocab.load(ml, kv);
+ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
+    return ::select_buft(
+            *pimpl->dev_layer.at(il).buft_list,
+            [&](ggml_context * ctx) {
+                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
+                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
+                return ggml_add(ctx, cur, layer_dir);
+            });
 }
 
-bool llama_model::load_tensors(llama_model_loader & ml) {
-    const auto & split_mode   = params.split_mode;
-    const auto & use_mlock    = params.use_mlock;
-    const auto & tensor_split = params.tensor_split;
-
-    const int n_layer      = hparams.n_layer;
-    const int n_gpu_layers = this->n_gpu_layers();
-
-    const bool use_mmap_buffer = true;
-
-    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
-        __func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
-
-    // build a list of buffer types for the CPU and GPU devices
-    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
-    for (const auto & dev : devices) {
-        buft_list_t buft_list = make_gpu_buft_list(dev.dev, split_mode, tensor_split);
-        // add CPU buffer types as a fallback
-        buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
-        pimpl->gpu_buft_list.emplace(dev.dev, std::move(buft_list));
-    }
-
-    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (cpu_dev == nullptr) {
-        throw std::runtime_error(format("%s: no CPU backend found", __func__));
-    }
-
-    // calculate the split points
-    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
-    std::vector<float> splits(n_devices());
-    if (all_zero) {
-        // default split, by free memory
-        for (size_t i = 0; i < n_devices(); ++i) {
-            ggml_backend_dev_t dev = devices[i].dev;
-            size_t total;
-            size_t free;
-            ggml_backend_dev_memory(dev, &free, &total);
-
-            // devices can return 0 bytes for free and total memory if they do not
-            // have any to report. in this case, we will use the host memory as a fallback
-            // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
-            if (free == 0 && total == 0) {
-                ggml_backend_dev_memory(cpu_dev, &free, &total);
-            }
-            splits[i] = free;
-        }
-    } else {
-        std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
-    }
-
-    // sum and normalize the splits to get the split points
-    float split_sum = 0.0f;
-    for (size_t i = 0; i < n_devices(); ++i) {
-        split_sum += splits[i];
-        splits[i] = split_sum;
-    }
-    for (size_t i = 0; i < n_devices(); ++i) {
-        splits[i] /= split_sum;
-    }
-
-    const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
-    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
-    auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
-        const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
-        if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
-            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
-            return {cpu_dev, &pimpl->cpu_buft_list};
-        }
-        const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
-        auto * dev = devices.at(layer_gpu).dev;
-        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
-        return {dev, &pimpl->gpu_buft_list.at(dev)};
-    };
-
-    // assign the input layer
-    // there is very little benefit to offloading the input layer, so always keep it on the CPU
-    pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
+bool llama_model::has_tensor_overrides() const {
+    return pimpl->has_tensor_overrides;
+}
 
-    // assign the repeating layers to the devices according to the splits
-    pimpl->dev_layer.resize(n_layer);
-    for (int il = 0; il < n_layer; ++il) {
-        pimpl->dev_layer[il] = get_layer_buft_list(il);
+const ggml_tensor * llama_model::get_tensor(const char * name) const {
+    auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
+            [name](const std::pair<std::string, ggml_tensor *> & it) {
+                return it.first == name;
+            });
+    if (it == tensors_by_name.end()) {
+        return nullptr;
     }
 
-    // assign the output layer
-    pimpl->dev_output = get_layer_buft_list(n_layer);
-
-    const auto TENSOR_DUPLICATED      = llama_model_loader::TENSOR_DUPLICATED;
-    const auto TENSOR_NOT_REQUIRED    = llama_model_loader::TENSOR_NOT_REQUIRED;
-    const auto TENSOR_SKIP            = llama_model_loader::TENSOR_SKIP;
-    const auto TENSOR_SKIP_IF_VIRTUAL = llama_model_loader::TENSOR_SKIP_IF_VIRTUAL;
-
-    // create tensors for the weights
-    {
-        // note: cast to int64_t since we will use these for the tensor dimensions
-        const int64_t n_head        = hparams.n_head();
-        const int64_t n_head_kv     = hparams.n_head_kv();
-        const int64_t n_embd        = hparams.n_embd;
-        const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
-        const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
-        const int64_t n_embd_head_k = hparams.n_embd_head_k();
-        const int64_t n_embd_head_v = hparams.n_embd_head_v();
-        const int64_t n_ff          = hparams.n_ff();
-        const int64_t n_embd_gqa    = n_embd_v_gqa;
-        const int64_t n_vocab       = vocab.n_tokens();
-        const int64_t n_token_types = vocab.n_token_types();
-        const int64_t n_rot         = hparams.n_rot();
-        const int64_t n_expert      = hparams.n_expert;
-        const int64_t n_expert_used = hparams.n_expert_used;
-        const int64_t n_ctx_train   = hparams.n_ctx_train;
-
-        if (n_expert > 0 && hparams.n_expert_used == 0) {
-            throw std::runtime_error("model has expert layers but no expert layers are used");
-        }
-
-        auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
-            const buft_list_t * buft_list_layer = tn.bid == -1 ? nullptr : pimpl->dev_layer.at(tn.bid).buft_list;
-            return ml.create_tensor(
-                hparams, &pimpl->cpu_buft_list, pimpl->dev_input.buft_list, pimpl->dev_output.buft_list, buft_list_layer,
-                tn, ne, flags);
-        };
-
-        layers.resize(n_layer);
-
-        // TODO: move to a separate function
-        const auto tn = LLM_TN(arch);
-
-        // helper: try merged gate_up_exps first, fall back to separate gate and up
-        auto create_tensor_gate_up_exps = [&](llama_layer & layer, int bid, int64_t n_embd_, int64_t n_ff_, int64_t n_expert_, int flags) {
-            layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", bid), {n_embd_, n_ff_ * 2, n_expert_}, TENSOR_NOT_REQUIRED);
-            if (layer.ffn_gate_up_exps == nullptr) {
-                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags);
-                layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", bid), {n_embd_, n_ff_, n_expert_}, flags);
-            }
-        };
-
-        // helper: try to load merged qkv first, fall back to separate q, k, v
-        auto create_tensor_qkv = [&](llama_layer & layer, int bid,
-                int64_t n_embd_, int64_t n_embd_q_, int64_t n_embd_k_, int64_t n_embd_v_,
-                int flags) {
-            const int64_t n_embd_qkv = n_embd_q_ + n_embd_k_ + n_embd_v_;
-            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", bid), {n_embd_, n_embd_qkv}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
-            if (layer.wqkv) {
-                layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", bid), {n_embd_qkv}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
-            } else {
-                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", bid), {n_embd_, n_embd_q_}, flags);
-                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", bid), {n_embd_, n_embd_k_}, flags);
-                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", bid), {n_embd_, n_embd_v_}, flags);
-                layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", bid), {n_embd_q_}, TENSOR_NOT_REQUIRED);
-                layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", bid), {n_embd_k_}, TENSOR_NOT_REQUIRED);
-                layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);
-            }
-        };
-
-        switch (arch) {
-            case LLM_ARCH_LLAMA:
-            case LLM_ARCH_REFACT:
-            case LLM_ARCH_MINICPM:
-            case LLM_ARCH_GRANITE:
-            case LLM_ARCH_GRANITE_MOE:
-            case LLM_ARCH_MISTRAL3:
-            case LLM_ARCH_LLAMA_EMBED:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        // optional bias tensors
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
-                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        }
-                        else {
-                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        }
-
-                        if (n_expert == 0) {
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-
-                            // optional MLP bias
-                            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
-                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
-                        } else {
-                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
-                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);
-                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
-                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
-
-                            // For Granite MoE Shared
-                            if (hparams.n_ff_shexp > 0) {
-                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
-                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
-                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
-                            }
-                        }
-                    }
-                } break;
-            case LLM_ARCH_LLADA:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output =
-                            create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
-
-                        // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
-                        layer.wq =
-                            create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
-                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
-                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
-                        // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
-                        layer.wo =
-                            create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
-
-                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
-                                                         TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
-
-                        // optional MLP bias
-                        layer.ffn_gate_b =
-                            create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
-                        layer.ffn_down_b =
-                            create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
-                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
-                    }
-                }
-                break;
-            case LLM_ARCH_LLADA_MOE:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
-                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-
-                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
-
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
-                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
-                    }
-                } break;
-            case LLM_ARCH_LLAMA4:
-                {
-                    if (n_expert == 0) {
-                        throw std::runtime_error(arch_name() + " model cannot have zero experts");
-                    }
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        const bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
-
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-
-                        if (is_moe_layer) {
-                            const int64_t n_ff_exp = hparams.n_ff_exp;
-
-                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
-                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
-                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff_exp, n_embd, n_expert}, 0);
-                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
-
-                            // Shared expert
-                            const int64_t n_ff_shexp = n_ff_exp;
-                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, n_ff_shexp}, 0);
-                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd    }, 0);
-                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, n_ff_shexp}, 0);
-                        } else {
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                        }
-                    }
-                } break;
-            case LLM_ARCH_DECI:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-                        const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa(i);
-                        const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa(i);
-                        const int64_t n_ff          = hparams.n_ff(i);
-                        const int64_t n_head        = hparams.n_head(i);
-                        const int64_t n_head_kv     = hparams.n_head_kv(i);
-
-                        if (n_head_kv == 0 && n_head > 0) {
-                            // linear attention for DeciLMCausalModel
-                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        }
-                        else if (n_head_kv > 0) {
-                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                            create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
-                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-                        }
-
-                        // optional bias tensors
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-
-                        if (n_ff > 0) {
-                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        }
-
-                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
-                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        }
-                        else {
-                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        }
-
-                        if (n_ff > 0) {
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                        }
-
-                        // optional MLP bias
-                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
-                    }
-                } break;
-            case LLM_ARCH_MINICPM3:
-                {
-                    const int64_t n_embd_head_qk_rope = hparams.n_rot();
-                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
-
-                    const int64_t q_lora_rank  = hparams.n_lora_q;
-                    const int64_t kv_lora_rank = hparams.n_lora_kv;
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
-
-                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
-
-                        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
-                        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
-
-                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
-                        layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
-                        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-
-                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                    }
-                } break;
-            case LLM_ARCH_GROK:
-                {
-                    if (n_expert == 0) {
-                        throw std::runtime_error(arch_name() + " model cannot have zero experts");
-                    }
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff/* / n_expert_used*/; // grok-1 n_ff_exp == n_ff
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff,   n_embd}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd,   n_expert}, 0);
-                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
-
-                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                        if (!layer.ffn_post_norm) {
-                            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
-                        }
-                    }
-                } break;
-            case LLM_ARCH_DBRX:
-                {
-                    if (n_expert == 0) {
-                        throw std::runtime_error("DBRX model cannot have zero experts");
-                    }
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
-                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
-                    }
-                } break;
-            case LLM_ARCH_BAICHUAN:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                    {
-                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_FALCON:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    {
-                        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
-
-                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                        if (!output) {
-                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
-                        }
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
-
-                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_STARCODER:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, 0);
-
-                    // output
-                    {
-                        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
-                        output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                        if (!output) {
-                            // needs to be on GPU
-                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                        }
-
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
-
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
-                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
-
-                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
-
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff}, 0);
-                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_BERT:
-            case LLM_ARCH_NOMIC_BERT:
-            case LLM_ARCH_NOMIC_BERT_MOE:
-            case LLM_ARCH_JINA_BERT_V3:
-                {
-                    if (n_token_types == 0) {
-                        throw std::runtime_error(arch_name() + " model needs to define token type count");
-                    }
-                    tok_embd     = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
-                    type_embd    = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
-
-                    if (arch == LLM_ARCH_BERT) {
-                        pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,    "weight"), {n_embd, n_ctx_train}, 0);
-
-                        cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
-                        cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);
-
-                        cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
-                        cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
-                    }
-
-                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0);
-                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {n_embd}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-
-                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd}, 0);
-
-                        if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
-                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff,   n_expert}, 0);
-                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff,   n_embd, n_expert}, 0);
-                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,   "weight", i), {n_embd, n_expert}, 0);
-                        } else {
-                            layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, TENSOR_NOT_REQUIRED);
-                            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
-
-                            if (arch == LLM_ARCH_NOMIC_BERT) {
-                                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                            }
-                        }
-
-                        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
-                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i),   {n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_MODERN_BERT:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0);
-
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-
-                    for(int i = 0; i < n_layer; ++i) {
-                        auto& layer = layers[i];
-
-                        if ( i != 0 ) {
-                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        } else{
-                            // layer 0 uses identity
-                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                        }
-
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd }, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT,   "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, 2 * n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                    }
-
-                    cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT,  "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
-                    cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT,  "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
-                    cls       = create_tensor(tn(LLM_TENSOR_CLS,      "weight"), {n_embd, n_embd},            TENSOR_NOT_REQUIRED);
-                    cls_norm  = create_tensor(tn(LLM_TENSOR_CLS_NORM, "weight"), {n_embd},                    TENSOR_NOT_REQUIRED);
-
-                } break;
-            case LLM_ARCH_NEO_BERT:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
-
-                    cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
-                    cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);
-
-                    cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
-                    cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
-
-                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+    return it->second;
+}
 
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
+    return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
+}
 
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff*2}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_EUROBERT:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
+    return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
+}
 
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
+    const uint32_t n_ctx_seq = cparams.n_ctx_seq;
 
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_JINA_BERT_V2:
-                {
-                    tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0); // word_embeddings
-                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
-
-                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0); // LayerNorm
-                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {n_embd}, 0); // LayerNorm bias
-
-                    cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
-                    cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {1},         TENSOR_NOT_REQUIRED);
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i]; // JinaBertLayer
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-
-                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
-
-                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
-
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
-
-                        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
-                        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias",   i), {n_embd}, 0);
-
-                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
-
-                        const auto tn_ffn_up_weight = tn(LLM_TENSOR_FFN_UP, "weight", i);
-                        ggml_tensor * t_ffn_up = ml.get_tensor_meta(tn_ffn_up_weight.str().c_str());
-                        const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;
-
-                        GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
-                        layer.ffn_up   = create_tensor(tn_ffn_up_weight, {n_embd, n_ffn_up}, 0);
-                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ffn_up}, TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
-
-                        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
-                        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias",   i), {n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_BLOOM:
-                {
-                    tok_embd   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);
-                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0);
-                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {n_embd}, 0);
-
-                    // output
-                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), {n_embd}, 0);
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
-
-                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}, 0);
-
-                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd}, 0);
-
-                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
-
-                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias",   i), {n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_MPT:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
-
-                    // output
-                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, TENSOR_NOT_REQUIRED);
-
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    if (!output) {
-                        output    = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
-
-                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, TENSOR_NOT_REQUIRED);
-
-                        // FIXME test-llama-archs crashes if q_norm is created
-                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
-                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
-
-                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
-
-                        // AWQ ScaleActivation layer
-                        layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
-                    }
-                } break;
-            case LLM_ARCH_STABLELM:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
-                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm =   create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        // optional q and k layernorms, present in StableLM 2 12B
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head},    TENSOR_NOT_REQUIRED);
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
-
-                        // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
-                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_QWEN:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
-                        layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
-                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff/2}, 0);
-                    }
-                } break;
-            case LLM_ARCH_QWEN2:
-            case LLM_ARCH_QWEN2VL:
-            case LLM_ARCH_DREAM:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    output_b    = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_QWEN2MOE:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-
-                        if (n_expert == 0) {
-                            throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
-                        }
-                        if (n_expert_used == 0) {
-                            throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
-                        }
-
-                        // MoE branch
-                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
-
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
-                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
-
-                        // Shared expert branch
-                        const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
-
-                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, n_ff_shexp}, 0);
-                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp,     n_embd}, 0);
-                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, n_ff_shexp}, 0);
-                    }
-                } break;
-            case LLM_ARCH_QWEN3:
-            case LLM_ARCH_QWEN3VL:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    // output rerank head
-                    cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_QWEN3MOE:
-            case LLM_ARCH_QWEN3VLMOE:
-            case LLM_ARCH_RND1:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-
-                        if (n_expert == 0) {
-                            throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
-                        }
-                        if (n_expert_used == 0) {
-                            throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
-                        }
-
-                        // MoE branch
-                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
-
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
-                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
-                    }
-                } break;
-            case LLM_ARCH_PHI2:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-                    output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-
-                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
-
-                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
-
-                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_PHI3:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, TENSOR_NOT_REQUIRED);
-                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
-
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
-                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
-
-                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                    }
-                } break;
-            case LLM_ARCH_PHIMOE:
-                {
-                    const int64_t n_embd_head = n_embd / n_head;
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
-
-                    // output
-                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, 0);
-                    output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   { n_vocab }, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), { n_embd }, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), { n_embd }, 0);
-
-                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
-                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), { n_embd }, 0);
-
-                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert},         0);
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
-                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
-
-                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                     }
-                } break;
-            case LLM_ARCH_PLAMO:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_PLAMO2:
-                {
-                    // mamba parameters
-                    const uint32_t d_conv             = hparams.ssm_d_conv;
-                    const uint32_t d_state            = hparams.ssm_d_state;
-                    const uint32_t num_heads          = hparams.ssm_dt_rank;
-                    const uint32_t intermediate_size  = hparams.ssm_d_inner;
-                    const int64_t dt_dim              = std::max(64, int(hparams.n_embd / 16));
-
-                    // attention parameters
-                    const uint32_t qk_dim = hparams.n_embd_head_k();
-                    const uint32_t v_dim  = hparams.n_embd_head_v();
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-                        bool is_mamba_layer = hparams.is_recurrent(i);
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        if (is_mamba_layer) {
-                            layer.ssm_in       = create_tensor(tn(LLM_TENSOR_SSM_IN,     "weight", i), {n_embd, 2 * intermediate_size}, 0);
-                            layer.ssm_conv1d   = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
-
-                            layer.ssm_x    = create_tensor(tn(LLM_TENSOR_SSM_X,  "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
-                            layer.ssm_dt   = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
-                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
-
-                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
-                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
-
-                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
-
-                            layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
-                            layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
-                            layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
-                        } else {
-                            const int64_t num_attention_heads = hparams.n_head(i);
-                            const int64_t q_num_heads         = num_attention_heads;
-                            const int64_t num_key_value_heads = hparams.n_head_kv(i);
-                            const int64_t k_num_heads         = num_key_value_heads;
-                            const int64_t v_num_heads         = num_key_value_heads;
-                            const int64_t q_proj_dim          = q_num_heads * qk_dim;
-                            const int64_t k_proj_dim          = k_num_heads * qk_dim;
-                            const int64_t v_proj_dim          = v_num_heads * v_dim;
-
-                            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
-                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {qk_dim, num_attention_heads}, 0);
-                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {qk_dim, k_num_heads}, 0);
-                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
-                        }
-
-                        // All layers have post-attention norm, FFN norm, and FFN tensors
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);
-                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_PLAMO3:
-                {
-                    const int64_t head_dim_q = hparams.n_embd_head_k();
-                    const int64_t head_dim_v = hparams.n_embd_head_v();
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        const int64_t num_attention_heads = hparams.n_head(i);
-                        const int64_t num_key_value_heads = hparams.n_head_kv(i);
-                        const int64_t q_proj_dim = num_attention_heads * head_dim_q;
-                        const int64_t k_proj_dim = num_key_value_heads * head_dim_q;
-                        const int64_t v_proj_dim = num_key_value_heads * head_dim_v;
-                        const int64_t n_ff_cur   = hparams.n_ff(i);
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i),
-                                {n_embd,q_proj_dim + k_proj_dim + v_proj_dim}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim_q}, 0);
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim_q}, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {num_attention_heads * head_dim_v, n_embd}, 0);
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
-
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff_cur * 2}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_GPT2:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, 0);
-
-                    // output
-                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, 0);
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
-
-                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
-
-                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
-
-                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
-
-                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_CODESHELL:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if tok embd is NULL, init from output
-                    if (tok_embd == NULL) {
-                        tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    // output
-                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-
-                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
-
-                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
-
-                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
-
-                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff}, 0);
-                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_ORION:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_INTERNLM2:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_GEMMA:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_GEMMA2:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_GEMMA3:
-            case LLM_ARCH_GEMMA_EMBEDDING:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,   "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    // Dense linear weights
-                    dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.dense_2_feat_out}, TENSOR_NOT_REQUIRED);
-                    dense_3_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_3_OUT, "weight"), {hparams.dense_3_feat_in, n_embd}, TENSOR_NOT_REQUIRED);
-
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head_k}, 0);
-                        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head_k}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_GEMMA3N:
-                {
-                    const int64_t n_altup      = hparams.n_altup;
-                    const int64_t laurel_rank  = hparams.laurel_rank;
-                    const int64_t n_embd_altup = hparams.n_embd_altup;
-
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    altup_proj        = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ,        "weight"), {n_embd, n_embd, n_altup - 1}, 0);
-                    altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
-
-                    per_layer_tok_embd   = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
-                    per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight", 0), {n_embd, n_embd_altup * n_layer}, 0);
-                    per_layer_proj_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM,  "weight", 0), {n_embd_altup}, 0);
-
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head_k}, 0);
-                        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head_k}, 0);
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
-
-                        // altup & laurel
-                        layer.per_layer_inp_gate   = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE,  "weight", i), {n_embd, n_embd_altup}, 0);
-                        layer.per_layer_proj       = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ,      "weight", i), {n_embd_altup, n_embd}, 0);
-                        layer.per_layer_post_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
-                        layer.altup_correct_coef   = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF,  "weight", i), {n_altup, n_altup}, 0);
-                        layer.altup_correct_scale  = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
-                        layer.altup_predict_coef   = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF,  "weight", i), {n_altup, n_altup * n_altup}, 0);
-                        layer.altup_router         = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER,        "weight", i), {n_embd, n_altup}, 0);
-                        layer.altup_router_norm    = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM,   "weight", i), {n_embd}, 0);
-                        layer.laurel_l             = create_tensor(tn(LLM_TENSOR_LAUREL_L,            "weight", i), {n_embd, laurel_rank}, 0);
-                        layer.laurel_r             = create_tensor(tn(LLM_TENSOR_LAUREL_R,            "weight", i), {laurel_rank, n_embd}, 0);
-                        layer.laurel_post_norm     = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM,    "weight", i), {n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_GEMMA4:
-                {
-                    const uint32_t n_embd_per_layer = hparams.n_embd_per_layer;
-                    const int64_t  n_ff_exp         = hparams.n_ff_exp;
-
-                    if (n_embd_head_k != n_embd_head_v) {
-                        throw std::runtime_error("Gemma 4 requires n_embd_head_k == n_embd_head_v");
-                    }
-                    if (hparams.n_embd_head_k_swa != hparams.n_embd_head_v_swa) {
-                        throw std::runtime_error("Gemma 4 requires n_embd_head_k_swa == n_embd_head_v_swa");
-                    }
-
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    if (n_embd_per_layer > 0) {
-                        per_layer_tok_embd   = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"),    {n_embd_per_layer * n_layer, n_vocab}, 0);
-                        per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight", 0), {n_embd, n_embd_per_layer * n_layer}, 0);
-                        per_layer_proj_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM,  "weight", 0), {n_embd_per_layer}, 0);
-                    }
-
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-
-                    int rope_freqs_flag = 0;
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-                        const int64_t n_head      = hparams.n_head(i);
-                        const int64_t n_embd_head = hparams.n_embd_head_k(i);
-                        const int64_t n_embd_k    = hparams.n_embd_k_gqa(i);
-                        const int64_t n_embd_v    = hparams.n_embd_v_gqa(i);
-                        const int     kv_flags    = hparams.has_kv(i) ? 0 : TENSOR_NOT_REQUIRED;
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        // note: use_alternative_attention (v_proj is optional, if it's not present, use k_proj)
-                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head * n_head}, 0);
-                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k}, kv_flags);
-                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v}, TENSOR_NOT_REQUIRED);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head * n_head, n_embd}, 0);
-
-                        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head}, 0);
-                        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head}, kv_flags);
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.out_scale = create_tensor(tn(LLM_TENSOR_LAYER_OUT_SCALE, "weight", i), {1u}, TENSOR_NOT_REQUIRED);
-
-                        if (!hparams.is_swa(i)) {
-                            // full_attention layers use rope_freqs for proportional rope
-                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_embd_head/2}, rope_freqs_flag);
-                            rope_freqs_flag = TENSOR_DUPLICATED;
-                        }
-
-                        // handle use_double_wide_mlp
-                        int64_t n_ff_cur = hparams.n_ff(i);
-
-                        // for expert layers, we use normal FFN as shared expert (same as python code)
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff_cur}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff_cur}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
-                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
-
-                        // MoE router
-                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
-                        bool has_expert = layer.ffn_gate_inp != nullptr;
-
-                        // norm
-                        if (has_expert) {
-                            layer.ffn_gate_inp_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "scale", i), {n_embd}, 0);
-
-                            layer.ffn_pre_norm_2  = create_tensor(tn(LLM_TENSOR_FFN_PRE_NORM_2,  "weight", i), {n_embd}, 0);
-                            layer.ffn_post_norm_1 = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM_1, "weight", i), {n_embd}, 0);
-                            layer.ffn_post_norm_2 = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM_2, "weight", i), {n_embd}, 0);
-
-                            // MoE FFN
-                            layer.ffn_gate_up_exps  = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS,  "weight", i), {n_embd, n_ff_exp * 2, n_expert}, 0);
-                            layer.ffn_down_exps     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS,     "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
-
-                            // per-expert scale will be loaded as down_exps_s at the end of the current switch case
-                        }
-
-                        // per-layer embeddings
-                        if (n_embd_per_layer > 0) {
-                            layer.per_layer_inp_gate   = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE,  "weight", i), {n_embd, n_embd_per_layer}, 0);
-                            layer.per_layer_proj       = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ,      "weight", i), {n_embd_per_layer, n_embd}, 0);
-                            layer.per_layer_post_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
-                        }
-                    }
-                } break;
-            case LLM_ARCH_STARCODER2:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
-
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        // optional bias tensors
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
-                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
-
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-
-                        // optional bias tensors
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP ,  "bias", i), {  n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_MAMBA:
-                {
-                    const int64_t d_conv  = hparams.ssm_d_conv;
-                    const int64_t d_inner = hparams.ssm_d_inner;
-                    const int64_t d_state = hparams.ssm_d_state;
-                    const int64_t dt_rank = hparams.ssm_dt_rank;
-
-                    // only an expansion factor of 2 is supported for now
-                    if (2 * n_embd != d_inner) {
-                        throw std::runtime_error("only an expansion factor of 2 is supported for now");
-                    }
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed, duplicated to allow offloading
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        // norm
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
-
-                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
-                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
-
-                        layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
-
-                        layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
-                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
-
-                        // no "weight" suffix for these
-                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
-                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
-
-                        // out_proj
-                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_MAMBA2:
-                {
-                    const int64_t d_conv  = hparams.ssm_d_conv;
-                    const int64_t d_inner = hparams.ssm_d_inner;
-                    const int64_t d_state = hparams.ssm_d_state;
-                    const int64_t n_head  = hparams.ssm_dt_rank;
-                    const int64_t n_group = hparams.ssm_n_group;
-                    const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
-
-                    // only an expansion factor of 2 is supported for now
-                    GGML_ASSERT(2 * n_embd == d_inner);
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    {
-                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-
-                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
-                        if (output == NULL) {
-                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                        }
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        // norm
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
-
-                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
-                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
-
-                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
-
-                        // no "weight" suffix for these
-                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
-                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
-
-                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
-
-                        // out_proj
-                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_JAMBA:
-                {
-                    const int64_t d_conv  = hparams.ssm_d_conv;
-                    const int64_t d_inner = hparams.ssm_d_inner;
-                    const int64_t d_state = hparams.ssm_d_state;
-                    const int64_t dt_rank = hparams.ssm_dt_rank;
-
-                    // only an expansion factor of 2 is supported for now
-                    GGML_ASSERT(2 * n_embd == d_inner);
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    {
-                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-
-                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
-                        if (output == NULL) {
-                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                        }
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        const int64_t n_head_kv = hparams.n_head_kv(i);
-                        const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
-
-                        auto & layer = layers[i];
-
-                        // norm
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        if (n_head_kv == 0) {
-                            // Mamba layer
-                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
-
-                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
-                            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
-
-                            layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
-
-                            layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);
-
-                            layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
-                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
-
-                            layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
-                            layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);
-
-                            // no "weight" suffix for these
-                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
-                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
-
-                            // out_proj
-                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
-                        } else {
-                            // Attention layers
-
-                            create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        }
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
-
-                        if (layer.ffn_gate_inp) {
-                            // MoE
-                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
-                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
-                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert}, 0);
-                        } else {
-                            // FFN (no MoE)
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                        }
-                    }
-                } break;
-            case LLM_ARCH_GRANITE_HYBRID:
-                {
-                    // mamba2 Mixer SSM params
-                    // NOTE: int64_t for tensor dimensions
-                    const int64_t d_conv     = hparams.ssm_d_conv;
-                    const int64_t d_inner    = hparams.ssm_d_inner;
-                    const int64_t d_state    = hparams.ssm_d_state;
-                    const int64_t n_ssm_head = hparams.ssm_dt_rank;
-                    const int64_t n_group    = hparams.ssm_n_group;
-                    const int64_t d_in_proj  = 2*d_inner + 2*n_group*d_state + n_ssm_head;
-
-                    // only an expansion factor of 2 is supported for now
-                    GGML_ASSERT(2 * n_embd == d_inner);
-
-                    // embeddings
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    {
-                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
-                        if (output == NULL) {
-                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                        }
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        // norm
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        if (hparams.is_recurrent(i)) {
-                            // ssm layers
-                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
-
-                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
-                            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
-
-                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
-
-                            // no "weight" suffix for these
-                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
-                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
-
-                            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
-
-                            // out_proj
-                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
-                        } else {
-                            // attention layers (with optional bias)
-                            const int64_t n_head_i = hparams.n_head(i);
-                            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
-                            const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
-                            create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_i, n_embd_k_gqa_i, n_embd_v_gqa_i, 0);
-                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
-                            layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                        }
-
-                        // feed forward (w/ optional biases)
-                        if (n_expert > 0) {
-                            // MoE FFN
-                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
-                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);
-                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
-                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
-
-                            // For Granite MoE Shared
-                            if (hparams.n_ff_shexp > 0) {
-                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
-                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
-                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
-                            }
-                        } else {
-                            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
-                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
-                        }
-                    }
-                } break;
-            case LLM_ARCH_XVERSE:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_COMMAND_R:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    // init output from the input tok embed
-                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        if (n_layer >= 64){
-                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
-                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
-                        }
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_COHERE2:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                    // init output from the input tok embed
-                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
-                                                      TENSOR_DUPLICATED);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
-                    }
-                }
-                break;
-            case LLM_ARCH_OLMO:  // adapted from LLM_ARCH_LLAMA with norm params removed
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_OLMO2:
-                {
-                    const int64_t n_embd_head = n_embd / n_head;
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_SEED_OSS:
-                {
-                    const uint32_t head_dim             = hparams.n_embd_head_k();
-                    const int64_t n_qo_dim              = n_head * head_dim;
-                    const int64_t n_kv_dim              = n_head_kv * head_dim;
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        create_tensor_qkv(layer, i, n_embd, n_qo_dim, n_kv_dim, n_kv_dim, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
-
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                    }
-                } break;
-
-            case LLM_ARCH_OLMOE:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-
-                        if (n_expert == 0) {
-                            throw std::runtime_error("n_expert must be > 0");
-                        }
-                        if (n_expert_used == 0) {
-                            throw std::runtime_error("n_expert_used must be > 0");
-                        }
-
-                        // MoE branch
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
-                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
-                    }
-                } break;
-            case LLM_ARCH_OPENELM:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    // init output from the input tok embed
-                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        const int64_t n_head      =   hparams.n_head(i);
-                        const int64_t n_head_qkv  = 2*hparams.n_head_kv(i) + n_head;
-                        const int64_t n_ff        =   hparams.n_ff(i);
-
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_GPTNEOX:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
-
-                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
-
-                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
-
-                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
-
-                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_ARCTIC:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-                        layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, false);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
-                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
-                    }
-                } break;
-            case LLM_ARCH_DEEPSEEK:
-                {
-
-                    const int64_t n_ff_exp        = hparams.n_ff_exp;
-                    const int64_t n_expert_shared = hparams.n_expert_shared;
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    // try to load output.weight, if not found, use token_embd (tied embeddings)
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    if (!output) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        if (i < (int) hparams.n_layer_dense_lead) {
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                        } else {
-                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-
-                            if (n_expert == 0) {
-                                throw std::runtime_error("n_expert must be > 0");
-                            }
-                            if (n_expert_used == 0) {
-                                throw std::runtime_error("n_expert_used must be > 0");
-                            }
-
-                            // MoE branch
-                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
-                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
-                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
-
-                            // Shared expert branch
-                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
-                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
-                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
-                        }
-                    }
-                } break;
-            case LLM_ARCH_DEEPSEEK2:
-            case LLM_ARCH_MISTRAL4:
-                {
-                    const bool is_mla = hparams.is_mla();
-
-                    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
-                    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
-                    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
-
-                    const int64_t n_embd_head_qk_rope = hparams.n_rot();
-                    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
-                    GGML_ASSERT(n_embd_head_qk_nope >= 1);
-
-                    const int64_t q_lora_rank  = hparams.n_lora_q;
-                    const int64_t kv_lora_rank = hparams.n_lora_kv;
-
-                    const int64_t n_ff_exp        = hparams.n_ff_exp;
-                    const int64_t n_expert_shared = hparams.n_expert_shared;
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    // try to load output.weight, if not found, use token_embd (tied embeddings)
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    if (!output) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        if (q_lora_rank > 0) {
-                            layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
-                        }
-
-                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
-
-                        if (q_lora_rank > 0) {
-                            layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
-                            layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
-                        } else {
-                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
-                        }
-
-                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
-
-                        // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
-                        if (is_mla) {
-                            layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
-                            layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
-                        } else {
-                            layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
-                        }
-
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        if (i < (int) hparams.n_layer_dense_lead) {
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                        } else {
-                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
-
-                            if (n_expert == 0) {
-                                throw std::runtime_error("n_expert must be > 0");
-                            }
-                            if (n_expert_used == 0) {
-                                throw std::runtime_error("n_expert_used must be > 0");
-                            }
-
-                            // MoE branch
-                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
-                            create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
-
-                            // Shared expert branch
-                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
-                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
-                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
-                        }
-                    }
-                } break;
-            case LLM_ARCH_DEEPSEEK2OCR:
-                {
-                    // similar to deepseek2, but without MLA
-                    const int64_t n_ff_exp        = hparams.n_ff_exp;
-                    const int64_t n_expert_shared = hparams.n_expert_shared;
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    // try to load output.weight, if not found, use token_embd (tied embeddings)
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    if (!output) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd}, 0);
-                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd}, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        // norm
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        if (i < (int) hparams.n_layer_dense_lead) {
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        } else {
-                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
-
-                            if (n_expert == 0) {
-                                throw std::runtime_error("n_expert must be > 0");
-                            }
-                            if (n_expert_used == 0) {
-                                throw std::runtime_error("n_expert_used must be > 0");
-                            }
-
-                            // MoE branch
-                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
-                            create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
-
-                            // Shared expert branch
-                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
-                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
-                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
-                        }
-                    }
-                } break;
-            case LLM_ARCH_PLM:
-                {
-                    const int64_t n_embd_head_qk_rope = hparams.n_rot();
-                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
-                    const int64_t kv_lora_rank = hparams.n_lora_kv;
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    // output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.wq        = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
-                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
-                        layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
-                        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_BITNET:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm     = create_tensor(tn(LLM_TENSOR_ATTN_NORM,     "weight", i), {n_embd}, 0);
-                        layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.wq       = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
-                        layer.wq_s     = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
-                        layer.wk       = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
-                        layer.wk_s     = create_tensor(tn(LLM_TENSOR_ATTN_K,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
-                        layer.wv       = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
-                        layer.wv_s     = create_tensor(tn(LLM_TENSOR_ATTN_V,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
-                        layer.wo       = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.wo_s     = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_norm     = create_tensor(tn(LLM_TENSOR_FFN_NORM,     "weight", i), {n_embd}, 0);
-                        layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
-
-                        layer.ffn_gate       = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_gate_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_down       = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_down_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_up         = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_up_s   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
-                    }
-                } break;
-            case LLM_ARCH_T5:
-                {
-                    const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output_norm     = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
-
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    // n_layer:     number of encoder_layers
-                    // dec_n_layer: number of decoder_layers
-                    const int dec_n_layer = hparams.dec_n_layer;
-                    if (dec_n_layer > n_layer) {
-                        layers.resize(dec_n_layer);
-                    }
-
-                    // load encoder layers
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);
-                        layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
-
-                        layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
-                        layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
-                        layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
-                        layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
-
-                        layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-
-                    // load decoder layers
-                    for (int i = 0; i < dec_n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM,  "weight", i), {n_embd}, 0);
-                        layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
-
-                        layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
-                        layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
-                        layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
-
-                        layer.attn_norm_cross  = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM,  "weight", i), {n_embd}, 0);
-                        // this tensor seems to be unused in HF transformers implementation
-                        layer.attn_rel_b_cross = create_tensor(
-                            tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
-
-                        layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
-                        layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
-                        layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
-                        layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_T5ENCODER:
-                {
-                    const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);
-                        layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
-
-                        layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
-                        layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
-                        layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
-                        layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
-
-                        layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_JAIS:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, 0);
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
-
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
-                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
-
-                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
-
-                        layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "bias", i),   {n_ff}, 0);
-
-                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_JAIS2:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    if (!output) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
-
-                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
-                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        // attention biases - all have shape n_embd (output dimension of projections)
-                        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
-                        layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd}, 0);
-                        layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd}, 0);
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
-                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
-
-                        // Jais-2 uses simple MLP (no gate) with biases
-                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
-                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_CHATGLM:
-                {
-                    tok_embd   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
-
-                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);
-
-                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_GLM4:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        int flags = 0;
-                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
-                            // skip all tensors in the NextN layers
-                            flags |= TENSOR_SKIP;
-                        }
-
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, flags);
-
-                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
-
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, flags);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, flags);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, flags);
-
-                        layer.ffn_post_norm  = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags);
-
-                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
-                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
-                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
-                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
-                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
-
-                            // Optional tensors
-                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
-                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
-                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
-                        }
-                    }
-                } break;
-            case LLM_ARCH_GLM4_MOE:
-                {
-                    const int64_t n_expert        = hparams.n_expert;
-                    const int64_t n_expert_used   = hparams.n_expert_used;
-                    const int64_t n_expert_shared = hparams.n_expert_shared;
-
-                    GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
-                    GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
-                    }
-
-                    // Load ALL tensors including NextN layer to satisfy total tensor count
-                    // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
-                    for (int i = 0; i < n_layer; ++i) {
-                        int flags = 0;
-                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
-                            // skip all tensors in the NextN layers
-                            flags |= TENSOR_SKIP;
-                        }
-
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
-
-                        // GLM-style attention with bias terms
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, flags);
-
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
-
-                        // K/Q norm tensors (optional for GLM-4.5 355B variant)
-                        layer.attn_q_norm = create_tensor(
-                            tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
-                        layer.attn_k_norm = create_tensor(
-                            tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
-
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
-
-                        // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
-                        // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
-                        const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
-
-                        if (use_moe) {
-                            // MoE layers
-                            layer.ffn_gate_inp =
-                                create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
-                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
-
-                            // MoE branch
-                            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
-
-                            layer.ffn_gate_exps = create_tensor(
-                                tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
-                            layer.ffn_down_exps = create_tensor(
-                                tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
-                            layer.ffn_up_exps = create_tensor(
-                                tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
-
-                            // Shared expert
-                            if (n_expert_shared > 0) {
-                                const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
-                                layer.ffn_gate_shexp = create_tensor(
-                                    tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
-                                layer.ffn_down_shexp = create_tensor(
-                                    tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
-                                layer.ffn_up_shexp = create_tensor(
-                                    tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
-                            }
-                        } else {
-                            // Dense layers (first k layers) - GLM uses separate gate/up projections
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), { n_embd, n_ff }, flags);
-                        }
-
-                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
-                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
-                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
-                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
-                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
-
-                            // Optional tensors
-                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
-                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
-                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
-                        }
-                    }
-                }
-                break;
-            case LLM_ARCH_GLM_DSA:
-                {
-                    const bool is_mla = hparams.is_mla();
-                    if (!is_mla) {
-                        throw std::runtime_error("GLM_DSA architecture requires MLA");
-                    }
-
-                    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
-                    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
-                    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
-
-                    const int64_t n_embd_head_qk_rope = hparams.n_rot();
-                    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
-
-                    const int64_t q_lora_rank  = hparams.n_lora_q;
-                    const int64_t kv_lora_rank = hparams.n_lora_kv;
-
-                    const int64_t n_ff_exp        = hparams.n_ff_exp;
-                    const int64_t n_expert_shared = hparams.n_expert_shared;
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    // try to load output.weight, if not found, use token_embd (tied embeddings)
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    if (!output) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        int flags = 0;
-                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
-                            // skip all tensors in the NextN layers
-                            // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later
-                            flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED;
-                        }
-
-                        auto & layer = layers[i];
-
-                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
-                        layer.attn_q_a_norm  = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, flags);
-                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, flags);
-
-                        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, flags);
-                        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, flags);
-
-                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, flags);
-
-                        // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
-                        layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, flags);
-                        layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, flags);
-
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, flags);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
-
-                        // DSA indexer
-                        layer.indexer_k_norm   = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "weight", i), {hparams.indexer_head_size}, flags);
-                        layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "bias",   i), {hparams.indexer_head_size}, flags);
-                        layer.indexer_proj     = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ,     "weight", i), {n_embd, hparams.indexer_n_head}, flags);
-                        layer.indexer_attn_k   = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K,   "weight", i), {n_embd, hparams.indexer_head_size}, flags);
-                        layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags);
-                        if (i < (int) hparams.n_layer_dense_lead) {
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, flags);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, flags);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, flags);
-                        } else {
-                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
-                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
-
-                            if (n_expert == 0) {
-                                throw std::runtime_error("n_expert must be > 0");
-                            }
-                            if (n_expert_used == 0) {
-                                throw std::runtime_error("n_expert_used must be > 0");
-                            }
-
-                            // MoE branch
-                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
-                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, flags);
-                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
-
-                            // Shared expert branch
-                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
-                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, flags);
-                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
-                        }
-
-                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
-                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
-                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
-                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
-                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
-
-                            // Optional tensors
-                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
-                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
-                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
-                        }
-                    }
-                } break;
-            case LLM_ARCH_NEMOTRON:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        // optional bias tensors
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
-
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-
-                        // optional MLP bias
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
-                    }
-                } break;
-            case LLM_ARCH_NEMOTRON_H:
-            case LLM_ARCH_NEMOTRON_H_MOE:
-                {
-                    // mamba2 Mixer SSM params
-                    // NOTE: int64_t for tensor dimensions
-                    const int64_t d_conv     = hparams.ssm_d_conv;
-                    const int64_t d_inner    = hparams.ssm_d_inner;
-                    const int64_t d_state    = hparams.ssm_d_state;
-                    const int64_t n_ssm_head = hparams.ssm_dt_rank;
-                    const int64_t n_group    = hparams.ssm_n_group;
-                    const int64_t d_in_proj  = 2*d_inner + 2*n_group*d_state + n_ssm_head;
-                    const int64_t moe_n_embd = hparams.moe_latent_size > 0 ? hparams.moe_latent_size : n_embd;
-
-                    // embeddings
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    {
-                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
-                        if (output == NULL) {
-                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                        }
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        // all blocks use the attn norm
-                        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        if (hparams.is_recurrent(i)) {
-                            // ssm layers
-                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
-
-                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
-                            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
-
-                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
-
-                            // no "weight" suffix for these
-                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
-                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
-
-                            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
-
-                            // out_proj
-                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
-                        } else if (hparams.n_ff(i) == 0) {
-                            // attention layers (with optional bias)
-                            const int64_t n_head_i = hparams.n_head(i);
-                            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
-                            const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
-                            create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_i, n_embd_k_gqa_i, n_embd_v_gqa_i, 0);
-                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
-                            layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                        }  else {
-                            if (n_expert != 0) {
-                                const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
-                                const int64_t n_ff_shexp = hparams.n_ff_shexp;
-
-                                layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert}, 0);
-                                layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert         }, 0);
-
-                                // MoE branch
-                                layer.ffn_latent_down = create_tensor(tn(LLM_TENSOR_FFN_LATENT_DOWN, "weight", i), {n_embd, moe_n_embd}, TENSOR_NOT_REQUIRED);
-                                layer.ffn_latent_up   = create_tensor(tn(LLM_TENSOR_FFN_LATENT_UP,   "weight", i), {moe_n_embd, n_embd}, TENSOR_NOT_REQUIRED);
-
-                                layer.ffn_down_exps   = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   moe_n_embd, n_expert}, 0);
-                                layer.ffn_up_exps     = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {moe_n_embd, n_ff_exp, n_expert}, 0);
-
-                                // Shared expert branch
-                                layer.ffn_down_shexp  = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
-                                layer.ffn_up_shexp    = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, 0);
-
-                            } else {
-                                // mlp layers
-                                layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  hparams.n_ff(i), n_embd}, 0);
-                                layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   hparams.n_ff(i)}, 0);
-                                layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
-                                layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias",   i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
-                            }
-                        }
-                    }
-                } break;
-            case LLM_ARCH_EXAONE:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM,   "weight", i), {n_embd}, 0);
-                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN,   "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,     "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_EXAONE4:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_post_norm  = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_EXAONE_MOE:
-                {
-                    const int64_t n_ff_exp       = hparams.n_ff_exp;
-                    const int64_t n_expert       = hparams.n_expert;
-                    const int64_t n_expert_used  = hparams.n_expert_used;
-                    const int64_t n_ff_shexp     = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : n_ff_exp;
-                    const int64_t head_dim       = hparams.n_embd_head_k();
-                    const int64_t n_qo_dim       = n_head * head_dim;
-                    const int64_t n_kv_dim       = n_head_kv * head_dim;
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        int flags = 0;
-                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
-                            // skip all tensors in the NextN layers
-                            flags |= TENSOR_SKIP;
-                        }
-
-                        auto & layer = layers[i];
-                        create_tensor_qkv(layer, i, n_embd, n_qo_dim, n_kv_dim, n_kv_dim, flags);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, flags);
-
-                        layer.rope_freqs   = create_tensor(tn(LLM_TENSOR_ROPE_FREQS,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0) | flags);
-
-                        layer.attn_norm    = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, flags);
-                        layer.attn_q_norm  = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
-                        layer.attn_k_norm  = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
-
-                        layer.ffn_norm     = create_tensor(tn(LLM_TENSOR_FFN_NORM,    "weight", i), {n_embd}, flags);
-
-                        // dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end
-                        if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers)) {
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, flags);
-                        } else {
-                            layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, flags);
-                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
-
-                            if (n_expert == 0) {
-                                throw std::runtime_error("n_expert must be > 0");
-                            }
-                            if (n_expert_used == 0) {
-                                throw std::runtime_error("n_expert_used must be > 0");
-                            }
-
-                            layer.ffn_gate_exps  = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS,  "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
-                            layer.ffn_down_exps  = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS,  "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
-                            layer.ffn_up_exps    = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,    "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
-
-                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
-                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
-                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, flags);
-                        }
-
-                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
-                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
-                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
-                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM,   "weight", i), {n_embd}, flags);
-                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM,   "weight", i), {n_embd}, flags);
-
-                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
-                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS,     "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
-                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
-                        }
-                    }
-                } break;
-            case LLM_ARCH_RWKV6:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // Block 0, LN0
-                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0);
-                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {n_embd}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-                    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
-                    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
-                    const int head_size = hparams.wkv_head_size;
-                    const int attn_hidden_size = n_embd;
-                    const int ffn_size = hparams.n_ff_arr[0];
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
-
-                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, 0);
-
-                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
-                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
-
-                        layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
-                        layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
-                        layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
-                        layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
-                        layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
-                        layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
-                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
-                        GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
-
-                        layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
-                        layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
-                        layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
-                        layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
-                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
-                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
-                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
-                        layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
-
-                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
-                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
-                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
-
-                        layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
-                        layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
-
-                        layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
-                        layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
-                        layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
-                    }
-
-                } break;
-            case LLM_ARCH_RWKV6QWEN2:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-                    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
-                    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
-                    const int head_size = hparams.wkv_head_size;
-                    const int attn_hidden_size = n_embd;
-                    const int n_head_kv = hparams.n_head_kv();
-                    int attn_key_value_size;
-                    if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
-                        attn_key_value_size = attn_hidden_size;
-                    } else {
-                        attn_key_value_size = n_head_kv * head_size;
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
-                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
-
-                        layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
-                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
-
-                        layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
-                        layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
-                        layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
-                        layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
-                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
-                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
-                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
-                        layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
-                        // optional bias tensors
-                        layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
-                        layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
-                        layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
-
-                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_RWKV7:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // Block 0, LN0
-                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0);
-                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {n_embd}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-                    const int n_lora_decay = hparams.n_lora_decay;
-                    const int n_lora_iclr = hparams.n_lora_iclr;
-                    const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
-                    const int n_lora_gate = hparams.n_lora_gate;
-                    const int attn_hidden_size = n_embd;
-                    const int ffn_size = hparams.n_ff_arr[0];
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
-
-                        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
-                        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, 0);
-
-                        layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
-                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
-                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
-
-                        layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
-                        layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
-                        layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
-
-                        if (i == 0) {
-                            // actually not used
-                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
-                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
-                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
-                        } else {
-                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
-                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
-                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
-                        }
-
-                        layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
-                        layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);
-
-                        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
-
-                        layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
-                        layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
-                        layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
-
-                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
-                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
-                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
-
-                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
-                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
-                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
-
-                        layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
-
-                        layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
-                        layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
-                    }
-
-                } break;
-            case LLM_ARCH_ARWKV7:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-                    const int n_lora_decay = hparams.n_lora_decay;
-                    const int n_lora_iclr = hparams.n_lora_iclr;
-                    const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
-                    const int n_lora_gate = hparams.n_lora_gate;
-                    const int attn_hidden_size = n_embd;
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
-                        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
-                        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
-
-                        layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
-                        layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
-                        layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
-
-                        if (i == 0) {
-                            // actually not used
-                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
-                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
-                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
-                        } else {
-                            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
-                            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
-                            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
-                        }
-
-                        layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
-                        layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
-
-                        try {
-                            layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
-                        } catch(std::runtime_error & e) {
-                            // ARWKV models may not have gate tensors
-                            layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
-                        }
-
-                        layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
-                        layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
-                        layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
-
-                        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
-                        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
-                        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
-
-                        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-                        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-
-                } break;
-            case LLM_ARCH_CHAMELEON:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
-                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i),  {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
-                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i),  {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_WAVTOKENIZER_DEC:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd, n_vocab}, 0);
-
-                    conv1d   = create_tensor(tn(LLM_TENSOR_CONV1D, "weight", 0), {7, hparams.n_embd, hparams.posnet.n_embd}, 0);
-                    conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias",   0), {1, hparams.posnet.n_embd}, 0);
-
-                    // posnet
-                    {
-                        const int64_t n_embd = hparams.posnet.n_embd;
-
-                        for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
-                            auto & layer = layers[i].posnet;
-
-                            // posnet:
-                            //
-                            //  - resnet
-                            //  - resnet
-                            //  - attn
-                            //  - resnet
-                            //  - resnet
-                            //  - norm
-                            //
-                            switch (i) {
-                                case 0:
-                                case 1:
-                                case 3:
-                                case 4:
-                                    {
-                                        layer.norm1   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
-                                        layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias",   i), {1, n_embd}, 0);
-
-                                        layer.conv1   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
-                                        layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias",   i), {1, n_embd}, 0);
-
-                                        layer.norm2   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
-                                        layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias",   i), {1, n_embd}, 0);
-
-                                        layer.conv2   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
-                                        layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias",   i), {1, n_embd}, 0);
-                                    } break;
-                                case 2:
-                                    {
-                                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
-                                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);
-
-                                        layer.attn_q      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,    "weight", i), {1, n_embd, n_embd}, 0);
-                                        layer.attn_q_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,    "bias",   i), {1, n_embd}, 0);
-
-                                        layer.attn_k      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,    "weight", i), {1, n_embd, n_embd}, 0);
-                                        layer.attn_k_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,    "bias",   i), {1, n_embd}, 0);
-
-                                        layer.attn_v      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,    "weight", i), {1, n_embd, n_embd}, 0);
-                                        layer.attn_v_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,    "bias",   i), {1, n_embd}, 0);
-
-                                        layer.attn_o      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT,  "weight", i), {1, n_embd, n_embd}, 0);
-                                        layer.attn_o_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT,  "bias",   i), {1, n_embd}, 0);
-                                    } break;
-                                case 5:
-                                    {
-                                        layer.norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
-                                        layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);
-                                    } break;
-                                default: GGML_ABORT("unknown posnet layer");
-                            };
-                        }
-                    }
-
-                    GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
-
-                    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {hparams.posnet.n_embd}, 0);
-                    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {hparams.posnet.n_embd}, 0);
-
-                    // convnext
-                    {
-                        const int64_t n_embd = hparams.convnext.n_embd;
-
-                        for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
-                            auto & layer = layers[i].convnext;
-
-                            layer.dw     = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "weight", i), {7, 1, n_embd}, 0);
-                            layer.dw_b   = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "bias",   i), {1, n_embd}, 0);
-
-                            layer.norm   = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "weight", i), {n_embd}, 0);
-                            layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "bias",   i), {n_embd}, 0);
-
-                            layer.pw1    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "weight", i), {n_embd, n_ff}, 0);
-                            layer.pw1_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "bias",   i), {n_ff}, 0);
-
-                            layer.pw2    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "weight", i), {n_ff, n_embd}, 0);
-                            layer.pw2_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "bias",   i), {n_embd}, 0);
-
-                            layer.gamma  = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
-                        }
-
-                        // output
-                        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
-                    }
-
-                    output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, hparams.n_embd_out()}, 0);
-                    output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {hparams.n_embd_out()}, 0);
-                } break;
-            case LLM_ARCH_BAILINGMOE:
-                {
-                    const int64_t n_ff_exp            = hparams.n_ff_exp;
-                    const int64_t n_expert_shared     = hparams.n_expert_shared;
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_head * n_rot, n_head_kv * n_rot, n_head_kv * n_rot, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-
-                        if (n_expert == 0) {
-                            throw std::runtime_error("n_expert must be > 0");
-                        }
-                        if (n_expert_used == 0) {
-                            throw std::runtime_error("n_expert_used must be > 0");
-                        }
-
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
-                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
-
-                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
-                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
-                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
-                    }
-                } break;
-            case LLM_ARCH_BAILINGMOE2:
-                {
-                    const int64_t n_ff_exp        = hparams.n_ff_exp;
-                    const int64_t n_expert_shared = hparams.n_expert_shared;
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
-                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        int flags = 0;
-                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
-                            // skip all tensors in the NextN layers
-                            flags |= TENSOR_SKIP;
-                        }
-
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
-
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
-
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
-
-                        if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
-                            const int64_t n_ff_shexp = (hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp) * n_expert_shared;
-
-                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
-                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
-
-                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
-                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, flags);
-                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
-
-                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
-                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
-                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, flags);
-                        } else { // Dense layers
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, flags);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, flags);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, flags);
-                        }
-
-                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
-                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
-                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
-                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
-                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
-                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
-                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
-                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED | flags);
-                            layer.layer_out_norm         = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, flags);
-                        }
-                    }
-                } break;
-            case LLM_ARCH_DOTS1:
-                {
-                    const int64_t n_ff_exp        = hparams.n_ff_exp;
-                    const int64_t n_expert_shared = hparams.n_expert_shared;
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_head_k * n_head, n_embd_head_k * n_head, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        if (i < (int) hparams.n_layer_dense_lead) {
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                        } else {
-                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
-
-                            if (n_expert == 0) {
-                                throw std::runtime_error("n_expert must be > 0");
-                            }
-                            if (n_expert_used == 0) {
-                                throw std::runtime_error("n_expert_used must be > 0");
-                            }
-
-                            // MoE branch
-                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
-                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
-                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
-
-                            // Shared expert branch
-                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
-                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
-                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
-                        }
-                    }
-                } break;
-            case LLM_ARCH_ARCEE:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_AFMOE:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    const int64_t n_ff_exp = hparams.n_ff_exp;
-                    const int64_t n_expert_shared = hparams.n_expert_shared;
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        // dual attention normalization
-                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), {n_embd}, 0);
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
-
-                        // attention projections
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        // Q/K normalization
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
-
-                        // attention gating
-                        layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
-
-                        // dual ffn normalization
-                        layer.ffn_norm      = create_tensor(tn(LLM_TENSOR_FFN_NORM,      "weight", i), {n_embd}, 0);
-                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
-
-                        if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) {
-                            // MoE layers
-                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
-
-                            // grouped expert weights
-                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
-                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
-                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
-
-                            // shared expert
-                            if (n_expert_shared > 0) {
-                                const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
-                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
-                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
-                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, 0);
-                            }
-                        } else {
-                            // Dense layers
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                        }
-                    }
-                } break;
-            case LLM_ARCH_ERNIE4_5:
-            case LLM_ARCH_ERNIE4_5_MOE:
-            case LLM_ARCH_PADDLEOCR:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        // optional bias tensors
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
-                            int n_ff_exp = hparams.n_ff_exp;
-
-                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
-                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
-                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
-                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff_exp, n_embd, n_expert}, 0);
-                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
-
-                            // Shared expert (if present)
-                            if (hparams.n_ff_shexp > 0) {
-                                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, hparams.n_ff_shexp}, 0);
-                                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd    }, 0);
-                                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, hparams.n_ff_shexp}, 0);
-                            }
-                        } else { // Dense layers
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                        }
-                    }
-                } break;
-            case LLM_ARCH_FALCON_H1:
-                {
-                    // Common
-                    const int64_t hidden_size = hparams.n_embd; // hidden_size
-
-                    // mamba2 Mixer SSM params
-                    const int64_t ssm_conv_kernel_size  = hparams.ssm_d_conv; // ssm_conv_kernel_size
-                    const int64_t ssm_n_groups          = hparams.ssm_n_group; // ssm_n_groups
-                    const int64_t ssm_state_size        = hparams.ssm_d_state; // ssm_state_size
-                    const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
-                    const int64_t ssm_num_heads         = hparams.ssm_dt_rank; // ssm_num_heads
-                    const int64_t ssm_conv_dim          = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
-                    const int64_t ssm_projection_size   = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
-
-                    // attn params
-                    const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head
-                    const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
-
-                    // ffn params
-                    const int64_t ffn_intermediate_size = hparams.n_ff(0);
-
-                    // embeddings
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
-
-                    // output
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        /*SSM LAYERS*/
-                        // ssm in
-                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
-                        // ssm 1d conv
-                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
-                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
-                        // ssm_dt
-                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
-                        // no "weight" suffix for these
-                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
-                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
-                        // ssm_norm
-                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
-                        // out_proj
-                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
-
-                        /*ATTENTION LAYERS*/
-                        // attention layers (with optional bias)
-                        create_tensor_qkv(layer, i, hidden_size, n_embd_head_k * attn_num_attention_head, attn_num_key_value_head * n_embd_head_k, attn_num_key_value_head * n_embd_head_v, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
-
-
-                        // feed forward (w/ optional biases)
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
-                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size,   ffn_intermediate_size}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  ffn_intermediate_size, hidden_size}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {hidden_size,   ffn_intermediate_size}, 0);
-
-                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
-                    }
-                } break;
-            case LLM_ARCH_HUNYUAN_MOE:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-                        const uint32_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : hparams.n_ff(i);
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, 0);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
-                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
-
-                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
-                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, 0);
-                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
-                    }
-                } break;
-            case LLM_ARCH_HUNYUAN_VL:
-            case LLM_ARCH_HUNYUAN_DENSE:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-
-                    }
-                } break;
-            case LLM_ARCH_SMOLLM3:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_OPENAI_MOE:
-                {
-                    const int64_t n_ff_exp = hparams.n_ff_exp;
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), {n_embd}, 0);
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_head * n_rot, n_head_kv * n_rot, n_head_kv * n_rot, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
-
-                        layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
-
-                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {  n_embd, n_expert}, 0);
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
-                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
-
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
-                        layer.ffn_gate_inp_b  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "bias", i), {n_expert}, 0);
-                        layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
-                        layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), {  n_embd, n_expert}, 0);
-                        layer.ffn_up_exps_b   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "bias", i), {n_ff_exp, n_expert}, 0);
-                    }
-                } break;
-            case LLM_ARCH_LFM2:
-            case LLM_ARCH_LFM2MOE:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,           "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        const bool is_moe_layer = i >= static_cast<int>(hparams.n_layer_dense_lead);
-
-                        // ffn/moe is same for transformer and conv layers
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        if (is_moe_layer) {
-                            GGML_ASSERT(n_expert && n_expert_used);
-                            layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i),  {n_embd, n_expert}, 0);
-                            layer.ffn_gate_exps   = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
-                            layer.ffn_down_exps   = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {hparams.n_ff_exp,   n_embd, n_expert}, 0);
-                            layer.ffn_up_exps     = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i),   {n_embd, hparams.n_ff_exp, n_expert}, 0);
-                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
-                        } else {  // dense
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                        }
-
-                        // for operator_norm
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        if (!hparams.is_recurrent(i)) {
-                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
-                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
-                            GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
-
-                            create_tensor_qkv(layer, i, n_embd, n_embd, hparams.n_embd_k_gqa(i), hparams.n_embd_v_gqa(i), 0);
-
-                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        } else {
-                            layer.shortconv.conv     = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV,    "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
-                            layer.shortconv.in_proj  = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ,  "weight", i), {n_embd, 3 * n_embd}, 0);
-                            layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
-                        }
-                    }
-
-                    // for LFM2-ColBert-350M
-                    dense_2_out_layers   = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
-                    dense_2_out_layers_b = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "bias"),   {hparams.n_embd_out()        }, TENSOR_NOT_REQUIRED);
-                } break;
-            case LLM_ARCH_SMALLTHINKER:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
-
-                        GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
-                        GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");
-
-                        // MoE branch
-                        const int64_t n_ff_exp = hparams.n_ff_exp;
-                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
-                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
-                    }
-                } break;
-            case LLM_ARCH_GROVEMOE:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE");
-                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE");
-                    GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE");
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-
-                        // MoE branch
-                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
-                        const int64_t n_ff_chexp = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k;
-                        const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
-
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
-                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
-
-                        layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), {  n_embd, n_ff_chexp, n_chunk_expert}, 0);
-                        layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp,   n_embd, n_chunk_expert}, 0);
-                        layer.ffn_up_chexps   = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS,   "weight", i), {  n_embd, n_ff_chexp, n_chunk_expert}, 0);
-                    }
-                } break;
-            case LLM_ARCH_APERTUS:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
-
-                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
-                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        } else {
-                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        }
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
-
-                        // optional bias tensors
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
-                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
-
-                        // Q and K layernorms for Apertus
-                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
-                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
-                        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
-                        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
-                    }
-                } break;
-            case LLM_ARCH_MINIMAX_M2:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k * n_head}, 0);
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_k_gqa}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
-                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
-                        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
-                    }
-                } break;
-            case LLM_ARCH_KIMI_LINEAR:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        // Check for KDA specific tensors to determine layer type or if it's a mixed model
-                        // Assuming KDA layer if KDA tensors are present
-
-                        // KDA uses head_dim = 128 (from linear_attn_config.head_dim)
-                        const int64_t n_embd_head_k_kda = hparams.n_embd_head_kda;
-                        const int64_t n_embd_head_v_kda = hparams.n_embd_head_kda;
-                        const int64_t ssm_d_conv = hparams.ssm_d_conv;
-
-                        if (hparams.is_recurrent(i)) {
-                            // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
-                            // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner]
-                            layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
-                            if (!layer.ssm_q_conv) {
-                                layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0);
-                            }
-
-                             // KDA Layer - Conv1d weights may be 3D or 4D
-                             layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
-                             if (!layer.ssm_k_conv) {
-                                 layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0);
-                             }
-                             layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
-                             if (!layer.ssm_v_conv) {
-                                 layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head}, 0);
-                             }
-
-                             // q, k, v projections
-                             // Python: q_proj, k_proj, v_proj
-                             create_tensor_qkv(layer, i, n_embd, n_embd_head_k_kda * n_head, n_embd_head_k_kda * n_head, n_embd_head_v_kda * n_head, 0);
-
-                             // KDA specific projections
-                             // f_a_proj, f_b_proj
-                             layer.ssm_f_a = create_tensor(tn(LLM_TENSOR_SSM_F_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0); // head_dim
-                             layer.ssm_f_b = create_tensor(tn(LLM_TENSOR_SSM_F_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0); // projection_size
-
-                             // b_proj (beta mixing coefficient)
-                             layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), {n_embd, n_head}, 0);
-
-                             // A_log - Shape in GGUF: [1, num_heads, 1, 1] (4D) or [1, num_heads] (2D after quantization) Note: -exp(A_log) is applied in convert_hf_to_gguf.py
-                             layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head, 1, 1}, TENSOR_NOT_REQUIRED);
-                             if (!layer.ssm_a) {
-                                 layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
-                             }
-
-                             // dt_bias - shape [n_embd_head_k_kda * n_head] = [4096]
-                             layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_embd_head_k_kda * n_head}, 0);
-
-                             // g_a_proj, g_b_proj (output gate)
-                             layer.ssm_g_a = create_tensor(tn(LLM_TENSOR_SSM_G_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0);
-                             layer.ssm_g_b = create_tensor(tn(LLM_TENSOR_SSM_G_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0);
-
-                             // o_norm (reusing SSM_NORM)
-                             layer.ssm_o_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {n_embd_head_k_kda}, 0); // FusedRMSNormGated
-
-                             // o_proj
-                             layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v_kda * n_head, n_embd}, 0);
-
-                        } else {
-                             // MLA Layer - use MLA-specific head dimensions
-                             const int64_t q_lora_rank  = hparams.n_lora_q;
-                             const int64_t kv_lora_rank = hparams.n_lora_kv;
-                             const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
-                             const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
-
-                             layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED);
-                             layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
-
-                             if (layer.attn_q_a_norm) {
-                                 layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
-                                 layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
-                             } else {
-                                 // Kimi MLA without Q compression: wq = [n_embd, n_head * n_embd_head_k_mla]
-                                 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
-                             }
-
-                             // Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA)
-                             // Note: hparams.n_rot may be 72 (from conversion) but actual is 64
-                             const int64_t qk_rope_head_dim = hparams.n_rot();  // From config: qk_rope_head_dim
-                             layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
-                             // Support Legacy GGUFs that don't split wkv_b (MLA KV cache disabled)
-                             layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i),
-                                {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
-                             if (!layer.wkv_b) { // MLA KV cache enabled
-                                 layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_k_mla - qk_rope_head_dim, kv_lora_rank, n_head}, 0);
-                                 layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
-                             }
-                             layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
-                        }
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        // MoE intermediate size (different from dense FFN)
-                        const int64_t n_ff_exp = hparams.n_ff_exp;
-
-                        // Kimi uses n_layer_dense_lead to determine which layers use dense FFN vs MoE
-                        // first_k_dense_replace = 1 means layer 0 uses dense FFN, layers 1+ use MoE
-                        if (i < (int) hparams.n_layer_dense_lead) {
-                            // Dense FFN layer - use normal n_ff
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                        } else {
-                            // MoE layer - use n_ff_exp (1024) instead of n_ff (9216)
-                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
-                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
-                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
-
-                            // Shared experts use moe_intermediate_size * num_shared_experts
-                            // Kimi: shared_expert_intermediate_size = 1024 * 1 = 1024
-                            // Tensors are 2D: [n_embd, n_ff_shexp] or [n_ff_shexp, n_embd]
-                            const int64_t n_ff_shexp_actual = n_ff_exp * (hparams.n_expert_shared > 0 ? hparams.n_expert_shared : 1);
-                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED);
-                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp_actual, n_embd}, TENSOR_NOT_REQUIRED);
-                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED);
-
-                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
-                        }
-                    }
-                } break;
-            case LLM_ARCH_COGVLM:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        layer.visexp_attn_wqkv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
-                        layer.visexp_attn_wo = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-
-                        layer.visexp_ffn_gate = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.visexp_ffn_down = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.visexp_ffn_up   = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_PANGU_EMBED:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        // weight tensors
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        // bias tensors
-                        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
-                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        } else {
-                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        }
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_QWEN3NEXT:
-                {
-                    if (n_expert == 0) {
-                        throw std::runtime_error(arch_name() + " model cannot have zero experts");
-                    }
-
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
-                    }
-
-                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
-
-                    // Calculate dimensions from hyperparameters
-                    const int64_t head_k_dim = hparams.ssm_d_state;
-                    const int64_t head_v_dim = hparams.ssm_d_state;
-                    const int64_t n_k_heads  = hparams.ssm_n_group;
-                    const int64_t n_v_heads  = hparams.ssm_dt_rank;
-                    const int64_t key_dim    = head_k_dim * n_k_heads;
-                    const int64_t value_dim  = head_v_dim * n_v_heads;
-                    const int64_t conv_dim   = key_dim * 2 + value_dim;
-
-                    // Calculate projection sizes
-                    const int64_t qkvz_dim = key_dim * 2 + value_dim * 2;
-                    const int64_t ba_dim   = n_v_heads * 2;
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-                        const uint32_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : hparams.n_ff(i);
-
-                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), { n_embd }, 0);
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
-
-                        if (!hparams.is_recurrent(i)) {
-                            // Attention layers
-                            create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0);
-                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
-
-                            // Q/K normalization for attention layers
-                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
-                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
-                        } else {
-                            // Linear attention (gated delta net) specific tensors
-                            // Create tensors with calculated dimensions
-                            // note: ssm_in is used by legacy GGUF
-                            layer.ssm_in         = create_tensor(tn(LLM_TENSOR_SSM_IN,         "weight", i), { n_embd, qkvz_dim }, TENSOR_NOT_REQUIRED);
-                            layer.wqkv           = create_tensor(tn(LLM_TENSOR_ATTN_QKV,       "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
-                            layer.wqkv_gate      = create_tensor(tn(LLM_TENSOR_ATTN_GATE,      "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
-                            layer.ssm_conv1d     = create_tensor(tn(LLM_TENSOR_SSM_CONV1D,     "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
-                            layer.ssm_dt         = create_tensor(tn(LLM_TENSOR_SSM_DT,         "bias",   i), { hparams.ssm_dt_rank }, 0);
-                            layer.ssm_a          = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN,             i), { hparams.ssm_dt_rank }, 0);
-                            layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
-                            layer.ssm_norm       = create_tensor(tn(LLM_TENSOR_SSM_NORM,       "weight", i), { head_v_dim }, 0);
-                            layer.ssm_out        = create_tensor(tn(LLM_TENSOR_SSM_OUT,        "weight", i), { value_dim, n_embd }, 0);
-                        }
-
-                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert }, 0);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
-                        create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
-
-                        // Shared experts
-                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
-                        layer.ffn_gate_shexp     = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP,     "weight", i), { n_embd, n_ff_shexp }, 0);
-                        layer.ffn_up_shexp       = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,       "weight", i), { n_embd, n_ff_shexp }, 0);
-                        layer.ffn_down_shexp     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP,     "weight", i), { n_ff_shexp, n_embd }, 0);
-                    }
-                } break;
-            case LLM_ARCH_QWEN35MOE:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
-                    }
-
-                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
-
-                    // Calculate dimensions from hyperparameters
-                    const int64_t head_k_dim = hparams.ssm_d_state;
-                    const int64_t head_v_dim = hparams.ssm_d_state;
-                    const int64_t n_k_heads  = hparams.ssm_n_group;
-                    const int64_t n_v_heads  = hparams.ssm_dt_rank;
-                    const int64_t key_dim    = head_k_dim * n_k_heads;
-                    const int64_t value_dim  = head_v_dim * n_v_heads;
-                    const int64_t conv_dim   = key_dim * 2 + value_dim;
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), { n_embd }, 0);
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
-
-                        if (!hparams.is_recurrent(i)) {
-                            // Attention layers
-                            create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0);
-                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
-
-                            // Q/K normalization for attention layers
-                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
-                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
-                        } else {
-                            // Linear attention (gated delta net) specific tensors
-                            // Create tensors with calculated dimensions
-                            layer.wqkv           = create_tensor(tn(LLM_TENSOR_ATTN_QKV,       "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
-                            layer.wqkv_gate      = create_tensor(tn(LLM_TENSOR_ATTN_GATE,      "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
-                            layer.ssm_conv1d     = create_tensor(tn(LLM_TENSOR_SSM_CONV1D,     "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
-                            layer.ssm_dt         = create_tensor(tn(LLM_TENSOR_SSM_DT,         "bias",   i), { hparams.ssm_dt_rank }, 0);
-                            layer.ssm_a          = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN,             i), { hparams.ssm_dt_rank }, 0);
-                            layer.ssm_beta       = create_tensor(tn(LLM_TENSOR_SSM_BETA,       "weight", i), { n_embd, n_v_heads }, 0);
-                            layer.ssm_alpha      = create_tensor(tn(LLM_TENSOR_SSM_ALPHA,      "weight", i), { n_embd, n_v_heads }, 0);
-                            layer.ssm_norm       = create_tensor(tn(LLM_TENSOR_SSM_NORM,       "weight", i), { head_v_dim }, 0);
-                            layer.ssm_out        = create_tensor(tn(LLM_TENSOR_SSM_OUT,        "weight", i), { value_dim, n_embd }, 0);
-                        }
-
-                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert }, 0);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
-                        create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
-
-                        // Shared experts
-                        const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
-
-                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
-                        layer.ffn_gate_shexp     = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP,     "weight", i), { n_embd, n_ff_shexp }, 0);
-                        layer.ffn_up_shexp       = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,       "weight", i), { n_embd, n_ff_shexp }, 0);
-                        layer.ffn_down_shexp     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP,     "weight", i), { n_ff_shexp, n_embd }, 0);
-                    }
-                } break;
-            case LLM_ARCH_QWEN35:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
-
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
-                    }
-
-                    // Calculate dimensions from hyperparameters
-                    const int64_t head_k_dim = hparams.ssm_d_state;
-                    const int64_t head_v_dim = hparams.ssm_d_state;
-                    const int64_t n_k_heads  = hparams.ssm_n_group;
-                    const int64_t n_v_heads  = hparams.ssm_dt_rank;
-                    const int64_t key_dim    = head_k_dim * n_k_heads;
-                    const int64_t value_dim  = head_v_dim * n_v_heads;
-                    const int64_t conv_dim   = key_dim * 2 + value_dim;
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), { n_embd }, 0);
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
-
-                        if (!hparams.is_recurrent(i)) {
-                            // Attention layers
-                            create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0);
-                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
-
-                            // Q/K normalization for attention layers
-                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
-                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
-                        } else {
-                            // Linear attention (gated delta net) specific tensors
-                            // Create tensors with calculated dimensions
-                            layer.wqkv           = create_tensor(tn(LLM_TENSOR_ATTN_QKV,       "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
-                            layer.wqkv_gate      = create_tensor(tn(LLM_TENSOR_ATTN_GATE,      "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
-                            layer.ssm_conv1d     = create_tensor(tn(LLM_TENSOR_SSM_CONV1D,     "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
-                            layer.ssm_dt         = create_tensor(tn(LLM_TENSOR_SSM_DT,         "bias",   i), { hparams.ssm_dt_rank }, 0);
-                            layer.ssm_a          = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN,             i), { hparams.ssm_dt_rank }, 0);
-                            layer.ssm_beta       = create_tensor(tn(LLM_TENSOR_SSM_BETA,       "weight", i), { n_embd, n_v_heads }, 0);
-                            layer.ssm_alpha      = create_tensor(tn(LLM_TENSOR_SSM_ALPHA,      "weight", i), { n_embd, n_v_heads }, 0);
-                            layer.ssm_norm       = create_tensor(tn(LLM_TENSOR_SSM_NORM,       "weight", i), { head_v_dim }, 0);
-                            layer.ssm_out        = create_tensor(tn(LLM_TENSOR_SSM_OUT,        "weight", i), { value_dim, n_embd }, 0);
-                        }
-
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            case LLM_ARCH_MIMO2:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-                        uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
-                        uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
-                        uint32_t n_head = hparams.n_head(i);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_v * n_head, n_embd }, 0);
-
-                        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_ATTN_NORM,  "weight", i), {n_embd}, 0);
-                        layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        // non-MoE branch
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
-
-                        // MoE branch
-                        int64_t n_ff_exp = hparams.n_ff_exp;
-                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp,   n_expert}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp,   n_expert}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
-                    }
-                } break;
-            case LLM_ARCH_STEP35:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
-
-                    // STEP35 supports per-layer partial RoPE dims; rope factors are stored as a single shared tensor
-                    // ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer.
-                    uint32_t n_rot_max = 0;
-                    for (int i = 0; i < n_layer; ++i) {
-                        n_rot_max = std::max(n_rot_max, hparams.n_rot(i));
-                    }
-                    if (n_rot_max == 0) {
-                        n_rot_max = n_rot;
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        const uint32_t n_head_l      = hparams.n_head(i);
-                        const uint32_t n_embd_k_gqa  = hparams.n_embd_k_gqa(i);
-                        const uint32_t n_embd_v_gqa  = hparams.n_embd_v_gqa(i);
-
-                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
-
-                        // optional rope factors (llama3) / longrope tensors
-                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
-                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        } else {
-                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        }
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_l, n_embd_k_gqa, n_embd_v_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0);
-
-                        // head-wise attention gate (Step35 self_attn.g_proj)
-                        layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
-                        // dense MLP (leading dense blocks)
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
-
-                        // MoE routed experts + selection bias (router_bias)
-                        const int64_t n_ff_exp = hparams.n_ff_exp;
-                        layer.ffn_gate_inp      = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_gate_exps     = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp,   n_expert}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_down_exps     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_up_exps       = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp,   n_expert}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_exp_probs_b   = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
-
-                        // shared expert MLP
-                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
-                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED);
-                    }
-                } break;
-            case LLM_ARCH_MAINCODER:
-                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
-                    // if output is NULL, init from the input tok embed
-                    if (output == NULL) {
-                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
-                    }
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
-                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
-                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
-
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
-                    }
-                } break;
-            default:
-                throw std::runtime_error("unknown architecture");
-        }
-
-        // generic pass: load optional per-tensor/per-expert ".scale" tensors (e.g. NVFP4 scale2)
-        // this avoids having to add scale loading to every architecture
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = layers[i];
-
-            // attention weight scales (per-tensor, shape {1})
-            if (!layer.wq_s && layer.wq) {
-                layer.wq_s = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.wk_s && layer.wk) {
-                layer.wk_s = create_tensor(tn(LLM_TENSOR_ATTN_K,   "scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.wv_s && layer.wv) {
-                layer.wv_s = create_tensor(tn(LLM_TENSOR_ATTN_V,   "scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.wo_s && layer.wo) {
-                layer.wo_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.wqkv_s && layer.wqkv) {
-                layer.wqkv_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.wqkv_gate_s && layer.wqkv_gate) {
-                layer.wqkv_gate_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-
-            // dense FFN weight scales (per-tensor, shape {1})
-            if (!layer.ffn_gate_s && layer.ffn_gate) {
-                layer.ffn_gate_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ffn_down_s && layer.ffn_down) {
-                layer.ffn_down_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ffn_up_s && layer.ffn_up) {
-                layer.ffn_up_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ffn_gate_shexp_s && layer.ffn_gate_shexp) {
-                layer.ffn_gate_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ffn_down_shexp_s && layer.ffn_down_shexp) {
-                layer.ffn_down_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ffn_up_shexp_s && layer.ffn_up_shexp) {
-                layer.ffn_up_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-
-            // MoE expert weight scales (per-expert, shape {n_expert})
-            if (!layer.ffn_gate_exps_s && layer.ffn_gate_exps) {
-                layer.ffn_gate_exps_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ffn_down_exps_s && layer.ffn_down_exps) {
-                layer.ffn_down_exps_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ffn_up_exps_s && layer.ffn_up_exps) {
-                layer.ffn_up_exps_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
-            }
-
-            // recurrent / linear-attention weight scales (per-tensor, shape {1})
-            if (!layer.ssm_in_s && layer.ssm_in) {
-                layer.ssm_in_s = create_tensor(tn(LLM_TENSOR_SSM_IN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ssm_out_s && layer.ssm_out) {
-                layer.ssm_out_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ssm_alpha_s && layer.ssm_alpha) {
-                layer.ssm_alpha_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ssm_beta_s && layer.ssm_beta) {
-                layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-
-            // input scales
-            if (!layer.wq_in_s && layer.wq) {
-                layer.wq_in_s = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.wk_in_s && layer.wk) {
-                layer.wk_in_s = create_tensor(tn(LLM_TENSOR_ATTN_K,   "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.wv_in_s && layer.wv) {
-                layer.wv_in_s = create_tensor(tn(LLM_TENSOR_ATTN_V,   "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.wo_in_s && layer.wo) {
-                layer.wo_in_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.wqkv_in_s && layer.wqkv) {
-                layer.wqkv_in_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.wqkv_gate_in_s && layer.wqkv_gate) {
-                layer.wqkv_gate_in_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ffn_gate_in_s && layer.ffn_gate) {
-                layer.ffn_gate_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ffn_down_in_s && layer.ffn_down) {
-                layer.ffn_down_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ffn_up_in_s && layer.ffn_up) {
-                layer.ffn_up_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ffn_gate_exps_in_s && layer.ffn_gate_exps) {
-                layer.ffn_gate_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ffn_down_exps_in_s && layer.ffn_down_exps) {
-                layer.ffn_down_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ffn_up_exps_in_s && layer.ffn_up_exps) {
-                layer.ffn_up_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ffn_gate_shexp_in_s && layer.ffn_gate_shexp) {
-                layer.ffn_gate_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ffn_down_shexp_in_s && layer.ffn_down_shexp) {
-                layer.ffn_down_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ffn_up_shexp_in_s && layer.ffn_up_shexp) {
-                layer.ffn_up_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ssm_in_in_s && layer.ssm_in) {
-                layer.ssm_in_in_s = create_tensor(tn(LLM_TENSOR_SSM_IN, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ssm_out_in_s && layer.ssm_out) {
-                layer.ssm_out_in_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ssm_alpha_in_s && layer.ssm_alpha) {
-                layer.ssm_alpha_in_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-            if (!layer.ssm_beta_in_s && layer.ssm_beta) {
-                layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
-            }
-        }
-    }
-
-    ml.done_getting_tensors();
-
-    // populate tensors_by_name
-    for (auto & [_, ctx_ptr] : ml.ctx_map) {
-        for (auto * cur = ggml_get_first_tensor(ctx_ptr.get()); cur != NULL; cur = ggml_get_next_tensor(ctx_ptr.get(), cur)) {
-            tensors_by_name.emplace_back(ggml_get_name(cur), cur);
-        }
-    }
-
-    ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
-    pimpl->mappings.reserve(ml.mappings.size());
-
-    // create the backend buffers
-    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
-    ctx_buf_maps.reserve(ml.ctx_map.size());
-
-    // Ensure we have enough capacity for the maximum backend buffer we will potentially create
-    const size_t n_max_backend_buffer = ml.ctx_map.size() * ml.files.size();
-    pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
-
-    for (auto & [buft, ctx_ptr] : ml.ctx_map) {
-        ggml_context * ctx = ctx_ptr.get();
-
-        // skip contexts without tensors
-        if (ggml_get_first_tensor(ctx) == nullptr) {
-            continue;
-        }
-
-        llama_buf_map buf_map;
-        buf_map.reserve(n_max_backend_buffer);
-
-        // check if it is possible to use buffer_from_host_ptr with this buffer type
-        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
-        if (!dev) {
-            // FIXME: workaround for CPU backend buft having a NULL device
-            dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-            if (!dev) {
-                throw std::runtime_error(format("%s: no CPU backend found", __func__));
-            }
-        }
-        ggml_backend_dev_props props;
-        ggml_backend_dev_get_props(dev, &props);
-        bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
-        bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
-
-        std::vector<ggml_backend_buffer_ptr> bufs;
-        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
-            GGML_ASSERT(!ml.no_alloc);
-            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
-                // only the mmap region containing the tensors in the model is mapped to the backend buffer
-                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
-                //     then we could just use metal for all layers
-                // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
-                void * addr = nullptr;
-                size_t first, last; // NOLINT
-                ml.get_mapping_range(&first, &last, &addr, idx, ctx);
-                if (first >= last) {
-                    continue;
-                }
-                const size_t max_size = ggml_get_max_tensor_size(ctx);
-                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
-                if (buf == nullptr) {
-                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
-                }
-                bufs.emplace_back(buf);
-                buf_map.emplace(idx, buf);
-            }
-        } else {
-            ggml_backend_buffer_t buf;
-            if (ml.no_alloc) {
-                buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
-                for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
-                    t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
-                }
-            } else {
-                buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
-            }
-            if (buf == nullptr) {
-                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
-            }
-            if (use_mlock && ggml_backend_buffer_is_host(buf)) {
-                pimpl->mlock_bufs.emplace_back(new llama_mlock);
-                auto & mlock_buf = pimpl->mlock_bufs.back();
-                mlock_buf->init   (ggml_backend_buffer_get_base(buf));
-                mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
-            }
-            bufs.emplace_back(buf);
-            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
-                buf_map.emplace(idx, buf);
-            }
-        }
-
-        for (auto & buf : bufs) {
-            // indicate that this buffer contains weights
-            // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
-            ggml_backend_buffer_set_usage(buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-        }
-
-        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
-
-        ctx_buf_maps.emplace_back(ctx, buf_map);
-    }
-
-    if (llama_supports_gpu_offload()) {
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
-        int n_repeating = n_gpu;
-        if (n_repeating > 0) {
-            LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
-            n_repeating--;
-        }
-        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);
-
-        const int max_backend_supported_layers = hparams.n_layer + 1;
-        const int max_offloadable_layers       = hparams.n_layer + 1;
-
-        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-    }
-
-    // print memory requirements per buffer type
-    for (auto & [_, bufs] : pimpl->ctxs_bufs) {
-        for (auto & buf: bufs) {
-            LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
-                __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
-        }
-    }
-
-    if (ml.no_alloc) {
-        return true;
-    }
-
-    // load tensor data
-    for (auto & [ctx, buf_map] : ctx_buf_maps) {
-        if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
-            return false;
-        }
-    }
-
-    if (use_mmap_buffer) {
-        for (auto & mapping : ml.mappings) {
-            pimpl->mappings.emplace_back(std::move(mapping));
-        }
-    }
-
-    return true;
-}
-
-std::string llama_model::arch_name() const {
-    return llm_arch_name(arch);
-}
-
-std::string llama_model::type_name() const {
-    return llm_type_name(type);
-}
-
-std::string llama_model::desc() const {
-    return pimpl->desc_str;
-}
-
-size_t llama_model::size() const {
-    return pimpl->n_bytes;
-}
-
-size_t llama_model::n_tensors() const {
-    return tensors_by_name.size();
-}
-
-size_t llama_model::n_devices() const {
-    return devices.size();
-}
-
-const float * llama_model::tensor_split() const {
-    return params.tensor_split;
-}
-
-uint32_t llama_model::n_gpu_layers() const {
-    return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
-}
-
-llama_split_mode llama_model::split_mode() const {
-    return params.split_mode;
-}
-
-std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
-    std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
-        if (hparams.no_alloc) {
-            GGML_ASSERT(bufs.size() == 1);
-            ggml_backend_buffer_t buf = bufs[0].get();
-            GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
-            ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
-            ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
-        } else {
-            for (const auto & buf : bufs) {
-                // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
-                ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
-            }
-        }
-    }
-    return ret;
-}
-
-uint64_t llama_model::n_elements() const {
-    return pimpl->n_elements;
-}
-
-void llama_model::print_info() const {
-    const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
-
-    auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
-        bool is_var = false;
-
-        std::vector<uint32_t> v;
-        for (uint32_t i = 0; i < n; ++i) {
-            v.push_back(f(i));
-            if (v[i] != v[0]) {
-                is_var = true;
-            }
-        }
-
-        std::stringstream ss;
-
-        if (is_var) {
-            ss << "[";
-            for (uint32_t i = 0; i < n; ++i) {
-                ss << v[i];
-                if (i < n - 1) {
-                    ss << ", ";
-                }
-            }
-            ss << "]";
-        } else {
-            ss << v[0];
-        }
-
-        return ss.str();
-    };
-
-    // hparams
-    LLAMA_LOG_INFO("%s: arch                  = %s\n",     __func__, arch_name().c_str());
-    LLAMA_LOG_INFO("%s: vocab_only            = %d\n",     __func__, hparams.vocab_only);
-    LLAMA_LOG_INFO("%s: no_alloc              = %d\n",     __func__, hparams.no_alloc);
-
-    if (!hparams.vocab_only) {
-        LLAMA_LOG_INFO("%s: n_ctx_train           = %u\n",     __func__, hparams.n_ctx_train);
-        LLAMA_LOG_INFO("%s: n_embd                = %u\n",     __func__, hparams.n_embd);
-        LLAMA_LOG_INFO("%s: n_embd_inp            = %u\n",     __func__, hparams.n_embd_inp());
-        LLAMA_LOG_INFO("%s: n_layer               = %u\n",     __func__, hparams.n_layer);
-        LLAMA_LOG_INFO("%s: n_head                = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head(il);    }, hparams.n_layer).c_str());
-        LLAMA_LOG_INFO("%s: n_head_kv             = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
-        LLAMA_LOG_INFO("%s: n_rot                 = %u\n",     __func__, hparams.n_rot_full);
-        LLAMA_LOG_INFO("%s: n_swa                 = %u\n",     __func__, hparams.n_swa);
-        LLAMA_LOG_INFO("%s: is_swa_any            = %u\n",     __func__, hparams.is_swa_any());
-        LLAMA_LOG_INFO("%s: n_embd_head_k         = %u\n",     __func__, hparams.n_embd_head_k_full);
-        LLAMA_LOG_INFO("%s: n_embd_head_v         = %u\n",     __func__, hparams.n_embd_head_v_full);
-        LLAMA_LOG_INFO("%s: n_gqa                 = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il);        }, hparams.n_layer).c_str());
-        LLAMA_LOG_INFO("%s: n_embd_k_gqa          = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
-        LLAMA_LOG_INFO("%s: n_embd_v_gqa          = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
-        LLAMA_LOG_INFO("%s: f_norm_eps            = %.1e\n",   __func__, hparams.f_norm_eps);
-        LLAMA_LOG_INFO("%s: f_norm_rms_eps        = %.1e\n",   __func__, hparams.f_norm_rms_eps);
-        LLAMA_LOG_INFO("%s: f_clamp_kqv           = %.1e\n",   __func__, hparams.f_clamp_kqv);
-        LLAMA_LOG_INFO("%s: f_max_alibi_bias      = %.1e\n",   __func__, hparams.f_max_alibi_bias);
-        LLAMA_LOG_INFO("%s: f_logit_scale         = %.1e\n",   __func__, hparams.f_logit_scale);
-        LLAMA_LOG_INFO("%s: f_attn_scale          = %.1e\n",   __func__, hparams.f_attention_scale);
-        LLAMA_LOG_INFO("%s: n_ff                  = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
-        LLAMA_LOG_INFO("%s: n_expert              = %u\n",     __func__, hparams.n_expert);
-        LLAMA_LOG_INFO("%s: n_expert_used         = %u\n",     __func__, hparams.n_expert_used);
-        LLAMA_LOG_INFO("%s: n_expert_groups       = %d\n",     __func__, hparams.n_expert_groups);
-        LLAMA_LOG_INFO("%s: n_group_used          = %d\n",     __func__, hparams.n_group_used);
-        LLAMA_LOG_INFO("%s: causal attn           = %d\n",     __func__, hparams.causal_attn);
-        LLAMA_LOG_INFO("%s: pooling type          = %d\n",     __func__, hparams.pooling_type);
-        LLAMA_LOG_INFO("%s: rope type             = %d\n",     __func__, hparams.rope_type);
-        LLAMA_LOG_INFO("%s: rope scaling          = %s\n",     __func__, rope_scaling_type.c_str());
-        LLAMA_LOG_INFO("%s: freq_base_train       = %.1f\n",   __func__, hparams.rope_freq_base_train);
-        LLAMA_LOG_INFO("%s: freq_scale_train      = %g\n",     __func__, hparams.rope_freq_scale_train);
-        if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-            LLAMA_LOG_INFO("%s: freq_base_swa         = %.1f\n",   __func__, hparams.rope_freq_base_train_swa);
-            LLAMA_LOG_INFO("%s: freq_scale_swa        = %g\n",     __func__, hparams.rope_freq_scale_train_swa);
-            LLAMA_LOG_INFO("%s: n_embd_head_k_swa     = %u\n",     __func__, hparams.n_embd_head_k_swa);
-            LLAMA_LOG_INFO("%s: n_embd_head_v_swa     = %u\n",     __func__, hparams.n_embd_head_v_swa);
-            LLAMA_LOG_INFO("%s: n_rot_swa             = %u\n",     __func__, hparams.n_rot_swa);
-        }
-        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn       = %u\n",     __func__, hparams.n_ctx_orig_yarn);
-        LLAMA_LOG_INFO("%s: rope_yarn_log_mul     = %.4f\n",   __func__, hparams.rope_yarn_log_mul);
-        LLAMA_LOG_INFO("%s: rope_finetuned        = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
-        // MRoPE (Multi-axis Rotary Position Embedding) sections
-        if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
-            LLAMA_LOG_INFO("%s: mrope sections        = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
-        }
-        if (!classifier_labels.empty()) {
-            LLAMA_LOG_INFO("%s: n_cls_out             = %u\n", __func__, hparams.n_cls_out);
-
-            size_t i = 0;
-            for (const auto & label : classifier_labels) {
-                LLAMA_LOG_INFO("%s: cls_label[%2zu]         = %s\n", __func__, i++, label.c_str());
-            }
-        }
-
-        if (arch == LLM_ARCH_MAMBA ||
-                arch == LLM_ARCH_MAMBA2 ||
-                arch == LLM_ARCH_JAMBA ||
-                arch == LLM_ARCH_FALCON_H1 ||
-                arch == LLM_ARCH_PLAMO2 ||
-                arch == LLM_ARCH_GRANITE_HYBRID ||
-                arch == LLM_ARCH_QWEN3NEXT ||
-                arch == LLM_ARCH_QWEN35 ||
-                arch == LLM_ARCH_QWEN35MOE ||
-                arch == LLM_ARCH_NEMOTRON_H ||
-                arch == LLM_ARCH_NEMOTRON_H_MOE) {
-            LLAMA_LOG_INFO("%s: ssm_d_conv            = %u\n",     __func__, hparams.ssm_d_conv);
-            LLAMA_LOG_INFO("%s: ssm_d_inner           = %u\n",     __func__, hparams.ssm_d_inner);
-            LLAMA_LOG_INFO("%s: ssm_d_state           = %u\n",     __func__, hparams.ssm_d_state);
-            LLAMA_LOG_INFO("%s: ssm_dt_rank           = %u\n",     __func__, hparams.ssm_dt_rank);
-            LLAMA_LOG_INFO("%s: ssm_n_group           = %u\n",     __func__, hparams.ssm_n_group);
-            LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms        = %d\n",     __func__, hparams.ssm_dt_b_c_rms);
-        }
-
-        LLAMA_LOG_INFO("%s: model type            = %s\n",     __func__, type_name().c_str());
-        if (pimpl->n_elements >= 1e12) {
-            LLAMA_LOG_INFO("%s: model params          = %.2f T\n", __func__, pimpl->n_elements*1e-12);
-        } else if (pimpl->n_elements >= 1e9) {
-            LLAMA_LOG_INFO("%s: model params          = %.2f B\n", __func__, pimpl->n_elements*1e-9);
-        } else if (pimpl->n_elements >= 1e6) {
-            LLAMA_LOG_INFO("%s: model params          = %.2f M\n", __func__, pimpl->n_elements*1e-6);
-        } else {
-            LLAMA_LOG_INFO("%s: model params          = %.2f K\n", __func__, pimpl->n_elements*1e-3);
-        }
-
-        // general kv
-        LLAMA_LOG_INFO("%s: general.name          = %s\n",    __func__, name.c_str());
-
-        if (arch == LLM_ARCH_DEEPSEEK) {
-            LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
-            LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
-            LLAMA_LOG_INFO("%s: n_expert_shared       = %d\n",     __func__, hparams.n_expert_shared);
-            LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
-        }
-
-        if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
-            LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
-            LLAMA_LOG_INFO("%s: n_lora_q              = %d\n",     __func__, hparams.n_lora_q);
-            LLAMA_LOG_INFO("%s: n_lora_kv             = %d\n",     __func__, hparams.n_lora_kv);
-            LLAMA_LOG_INFO("%s: n_embd_head_k_mla     = %d\n",     __func__, hparams.n_embd_head_k_mla());
-            LLAMA_LOG_INFO("%s: n_embd_head_v_mla     = %d\n",     __func__, hparams.n_embd_head_v_mla());
-            LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
-            LLAMA_LOG_INFO("%s: n_expert_shared       = %d\n",     __func__, hparams.n_expert_shared);
-            LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
-            LLAMA_LOG_INFO("%s: expert_weights_norm   = %d\n",     __func__, hparams.expert_weights_norm);
-            LLAMA_LOG_INFO("%s: expert_gating_func    = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
-        }
-
-        if (arch == LLM_ARCH_QWEN2MOE) {
-            LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
-            LLAMA_LOG_INFO("%s: n_ff_shexp            = %d\n",     __func__, hparams.n_ff_shexp);
-        }
-
-        if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
-            LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
-        }
-
-        if (arch == LLM_ARCH_MINICPM ||
-                arch == LLM_ARCH_GRANITE ||
-                arch == LLM_ARCH_GRANITE_MOE ||
-                arch == LLM_ARCH_GRANITE_HYBRID ||
-                arch == LLM_ARCH_NEMOTRON_H_MOE) {
-            LLAMA_LOG_INFO("%s: f_embedding_scale     = %f\n", __func__, hparams.f_embedding_scale);
-            LLAMA_LOG_INFO("%s: f_residual_scale      = %f\n", __func__, hparams.f_residual_scale);
-            LLAMA_LOG_INFO("%s: f_attention_scale     = %f\n", __func__, hparams.f_attention_scale);
-            LLAMA_LOG_INFO("%s: n_ff_shexp            = %d\n", __func__, hparams.n_ff_shexp);
-        }
-
-        if (arch == LLM_ARCH_BAILINGMOE) {
-            LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
-            LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
-            LLAMA_LOG_INFO("%s: n_expert_shared       = %d\n",     __func__, hparams.n_expert_shared);
-            LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
-            LLAMA_LOG_INFO("%s: expert_weights_norm   = %d\n",     __func__, hparams.expert_weights_norm);
-        }
-
-        if (arch == LLM_ARCH_BAILINGMOE2) {
-            LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
-            LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
-            LLAMA_LOG_INFO("%s: n_ff_shexp            = %d\n",     __func__, hparams.n_ff_shexp);
-            LLAMA_LOG_INFO("%s: n_expert_shared       = %d\n",     __func__, hparams.n_expert_shared);
-            LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
-            LLAMA_LOG_INFO("%s: expert_weights_norm   = %d\n",     __func__, hparams.expert_weights_norm);
-            LLAMA_LOG_INFO("%s: expert_gating_func    = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
-            LLAMA_LOG_INFO("%s: nextn_predict_layers  = %d\n",     __func__, hparams.nextn_predict_layers);
-        }
-
-        if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
-            LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
-            LLAMA_LOG_INFO("%s: expert_gating_func    = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
-        }
-
-        if (arch == LLM_ARCH_GROVEMOE) {
-            LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
-            LLAMA_LOG_INFO("%s: n_ff_chexp            = %d\n",     __func__, hparams.n_ff_chexp);
-            LLAMA_LOG_INFO("%s: n_group_experts       = %d\n",     __func__, hparams.n_group_experts);
-            LLAMA_LOG_INFO("%s: expert_group_scale    = %.2f\n",   __func__, hparams.expert_group_scale);
-        }
-    }
-
-    vocab.print_info();
-}
-
-ggml_backend_dev_t llama_model::dev_layer(int il) const {
-    return pimpl->dev_layer.at(il).dev;
-}
-
-ggml_backend_dev_t llama_model::dev_output() const {
-    return pimpl->dev_output.dev;
-}
-
-template<typename F>
-static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
-    ggml_init_params params = {
-        /*.mem_size   =*/ ggml_tensor_overhead()*8,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true,
-    };
-
-    ggml_context_ptr ctx { ggml_init(params) };
-    if (!ctx) {
-        throw std::runtime_error(format("failed to create ggml context"));
-    }
-
-    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
-    ggml_tensor * op_tensor = fn(ctx.get());
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (op_tensor->src[i] != nullptr) {
-            assert(op_tensor->src[i]->buffer == nullptr);
-            op_tensor->src[i]->buffer = buf.get();
-        }
-    }
-
-    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-
-    return op_supported;
-}
-
-template<typename F>
-static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
-    for (const auto & cur : buft_list) {
-        ggml_backend_dev_t cur_dev = cur.first;
-        ggml_backend_buffer_type_t cur_buft = cur.second;
-        if (buft_supported(cur_buft, cur_dev, fn)) {
-            return cur_buft;
-        }
-    }
-
-    throw std::runtime_error(format("no suitable buffer type found"));
-}
-
-ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
-    return ::select_buft(
-            *pimpl->dev_layer.at(il).buft_list,
-            [&](ggml_context * ctx) {
-                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
-                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
-                return ggml_add(ctx, cur, layer_dir);
-            });
-}
-
-bool llama_model::has_tensor_overrides() const {
-    return pimpl->has_tensor_overrides;
-}
-
-const ggml_tensor * llama_model::get_tensor(const char * name) const {
-    auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
-            [name](const std::pair<std::string, ggml_tensor *> & it) {
-                return it.first == name;
-            });
-    if (it == tensors_by_name.end()) {
-        return nullptr;
-    }
-
-    return it->second;
-}
-
-float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
-    return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
-}
-
-float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
-    return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
-}
-
-ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
-    const uint32_t n_ctx_seq = cparams.n_ctx_seq;
-
-    // choose long/short freq factors based on the context size
-    if (layers[il].rope_freqs != nullptr) {
-        return layers[il].rope_freqs;
-    }
-
-    if (n_ctx_seq > hparams.n_ctx_orig_yarn) {
-        return layers[il].rope_long;
-    }
-
-    return layers[il].rope_short;
-}
-
-llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
-    llama_memory_i * res;
-
-    switch (arch) {
-        // Models that need specific instantiation should be handled in the
-        // switch statement
-        case LLM_ARCH_BERT:
-        case LLM_ARCH_JINA_BERT_V2:
-        case LLM_ARCH_JINA_BERT_V3:
-        case LLM_ARCH_NOMIC_BERT:
-        case LLM_ARCH_NOMIC_BERT_MOE:
-        case LLM_ARCH_NEO_BERT:
-        case LLM_ARCH_EUROBERT:
-        case LLM_ARCH_WAVTOKENIZER_DEC:
-        case LLM_ARCH_MODERN_BERT:
-        case LLM_ARCH_GEMMA_EMBEDDING:
-        case LLM_ARCH_DREAM:
-        case LLM_ARCH_LLADA:
-        case LLM_ARCH_LLADA_MOE:
-        case LLM_ARCH_RND1:
-            {
-                res = nullptr;
-            } break;
-        // Models that need standard caching should rely on recurrent/hybrid
-        // checks
-        default:
-            {
-                if (llm_arch_is_recurrent(arch)) {
-                    res = new llama_memory_recurrent(
-                            *this,
-                            GGML_TYPE_F32,
-                            GGML_TYPE_F32,
-                            cparams.offload_kqv,
-                            std::max((uint32_t) 1, cparams.n_seq_max),
-                            cparams.n_seq_max,
-                            nullptr);
-                } else if (llm_arch_is_hybrid(arch)) {
-                    // The main difference between hybrid architectures is the
-                    // layer filters, so pick the right one here
-                    llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
-                    llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
-                    if (arch == LLM_ARCH_FALCON_H1) {
-                        filter_attn = [&](int32_t) { return true; };
-                        filter_recr = [&](int32_t) { return true; };
-                    } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
-                        filter_attn = [&](int32_t il) {
-                            return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
-                        };
-                        filter_recr = [&](int32_t il) {
-                            return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
-                        };
-                    }
-
-                    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-                        // Use hybrid-iswa for hybrid models with SWA
-                        res = new llama_memory_hybrid_iswa(
-                            /* model             */ *this,
-                            /* attn_type_k       */ params.type_k,
-                            /* attn_type_v       */ params.type_v,
-                            /* attn_v_trans      */ !cparams.flash_attn,
-                            /* attn_swa_full     */ params.swa_full,
-                            /* attn_kv_size      */ cparams.n_ctx_seq,
-                            /* attn_n_ubatch     */ cparams.n_ubatch,
-                            /* attn_n_pad        */ 1,
-                            /* recurrent_type_r  */ GGML_TYPE_F32,
-                            /* recurrent_type_s  */ GGML_TYPE_F32,
-                            /* recurrent_rs_size */ std::max((uint32_t) 1, cparams.n_seq_max),
-                            /* n_seq_max         */ cparams.n_seq_max,
-                            /* offload           */ cparams.offload_kqv,
-                            /* unified           */ cparams.kv_unified,
-                            /* filter_attn       */ std::move(filter_attn),
-                            /* filter_recr       */ std::move(filter_recr));
-                    } else {
-                        res = new llama_memory_hybrid(
-                            /* model             */ *this,
-                            /* attn_type_k       */ params.type_k,
-                            /* attn_type_v       */ params.type_v,
-                            /* attn_v_trans      */ !cparams.flash_attn,
-                            /* attn_kv_size      */ cparams.n_ctx_seq,
-                            /* attn_n_pad        */ 1,
-                            /* attn_n_swa        */ hparams.n_swa,
-                            /* attn_swa_type     */ hparams.swa_type,
-                            /* recurrent_type_k  */ GGML_TYPE_F32,
-                            /* recurrent_type_v  */ GGML_TYPE_F32,
-                            /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
-                            /* n_seq_max         */ cparams.n_seq_max,
-                            /* offload           */ cparams.offload_kqv,
-                            /* unified           */ cparams.kv_unified,
-                            /* filter_attn       */ std::move(filter_attn),
-                            /* filter_recr       */ std::move(filter_recr));
-                    }
-                } else {
-                    llama_memory_i::layer_reuse_cb reuse = nullptr;
-
-                    if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) {
-                        reuse = [&](int32_t il) {
-                            if (il >= (int32_t) hparams.n_layer_kv_from_start) {
-                                return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
-                            }
-
-                            return -1;
-                        };
-                    }
-
-                    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-                        GGML_ASSERT(hparams.is_swa_any());
-
-                        res = new llama_kv_cache_iswa(
-                                *this,
-                                params.type_k,
-                                params.type_v,
-                                !cparams.flash_attn,
-                                cparams.offload_kqv,
-                                params.swa_full,
-                                cparams.kv_unified,
-                                cparams.n_ctx_seq,
-                                cparams.n_seq_max,
-                                cparams.n_ubatch,
-                                1,
-                                nullptr,
-                                reuse);
-                    } else {
-                        GGML_ASSERT(!hparams.is_swa_any());
-
-                        res = new llama_kv_cache(
-                                *this,
-                                params.type_k,
-                                params.type_v,
-                                !cparams.flash_attn,
-                                cparams.offload_kqv,
-                                cparams.kv_unified,
-                                cparams.n_ctx_seq,
-                                cparams.n_seq_max,
-                                1,
-                                hparams.n_swa,
-                                hparams.swa_type,
-                                nullptr,
-                                nullptr);
-                    }
-                }
-            }
-    }
-
-    return res;
-}
-
-ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
-    std::unique_ptr<llm_graph_context> llm;
-
-    switch (arch) {
-        case LLM_ARCH_LLAMA:
-            {
-                llm = std::make_unique<llm_build_llama<false>>(*this, params);
-            } break;
-        case LLM_ARCH_LLAMA4:
-            {
-                if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
-                    llm = std::make_unique<llm_build_llama4<false>>(*this, params);
-                } else {
-                    llm = std::make_unique<llm_build_llama4<true>>(*this, params);
-                }
-            } break;
-        case LLM_ARCH_LLAMA_EMBED:
-            {
-                llm = std::make_unique<llm_build_llama<true>>(*this, params);
-            } break;
-        case LLM_ARCH_MAINCODER:
-            {
-                llm = std::make_unique<llm_build_maincoder>(*this, params);
-            } break;
-        case LLM_ARCH_DECI:
-            {
-                llm = std::make_unique<llm_build_deci>(*this, params);
-            } break;
-        case LLM_ARCH_BAICHUAN:
-            {
-                llm = std::make_unique<llm_build_baichuan>(*this, params);
-            } break;
-        case LLM_ARCH_FALCON:
-            {
-                llm = std::make_unique<llm_build_falcon>(*this, params);
-            } break;
-        case LLM_ARCH_GROK:
-            {
-                llm = std::make_unique<llm_build_grok>(*this, params);
-            } break;
-        case LLM_ARCH_STARCODER:
-            {
-                llm = std::make_unique<llm_build_starcoder>(*this, params);
-            } break;
-        case LLM_ARCH_REFACT:
-            {
-                llm = std::make_unique<llm_build_refact>(*this, params);
-            } break;
-        case LLM_ARCH_BERT:
-        case LLM_ARCH_JINA_BERT_V2:
-        case LLM_ARCH_JINA_BERT_V3:
-        case LLM_ARCH_NOMIC_BERT:
-        case LLM_ARCH_NOMIC_BERT_MOE:
-            {
-                llm = std::make_unique<llm_build_bert>(*this, params);
-            } break;
-        case LLM_ARCH_MODERN_BERT:
-            {
-                llm = std::make_unique<llm_build_modern_bert>(*this, params);
-            } break;
-        case LLM_ARCH_NEO_BERT:
-            {
-                llm = std::make_unique<llm_build_neo_bert>(*this, params);
-            } break;
-        case LLM_ARCH_EUROBERT:
-            {
-                llm = std::make_unique<llm_build_eurobert>(*this, params);
-            } break;
-        case LLM_ARCH_BLOOM:
-            {
-                llm = std::make_unique<llm_build_bloom>(*this, params);
-            } break;
-        case LLM_ARCH_MPT:
-            {
-                llm = std::make_unique<llm_build_mpt>(*this, params);
-            } break;
-        case LLM_ARCH_STABLELM:
-            {
-                llm = std::make_unique<llm_build_stablelm>(*this, params);
-            } break;
-        case LLM_ARCH_QWEN:
-            {
-                llm = std::make_unique<llm_build_qwen>(*this, params);
-            } break;
-        case LLM_ARCH_QWEN2:
-            {
-                llm = std::make_unique<llm_build_qwen2>(*this, params);
-            } break;
-        case LLM_ARCH_DREAM:
-            {
-                llm = std::make_unique<llm_build_dream>(*this, params);
-            } break;
-        case LLM_ARCH_LLADA:
-            {
-                llm = std::make_unique<llm_build_llada>(*this, params);
-            } break;
-        case LLM_ARCH_LLADA_MOE:
-            {
-                llm = std::make_unique<llm_build_llada_moe>(*this, params);
-            } break;
-        case LLM_ARCH_RND1:
-            {
-                llm = std::make_unique<llm_build_rnd1>(*this, params);
-            } break;
-        case LLM_ARCH_QWEN2VL:
-            {
-                llm = std::make_unique<llm_build_qwen2vl>(*this, params);
-            } break;
-        case LLM_ARCH_QWEN2MOE:
-            {
-                llm = std::make_unique<llm_build_qwen2moe>(*this, params);
-            } break;
-        case LLM_ARCH_QWEN3:
-            {
-                llm = std::make_unique<llm_build_qwen3>(*this, params);
-            } break;
-        case LLM_ARCH_QWEN3MOE:
-            {
-                llm = std::make_unique<llm_build_qwen3moe>(*this, params);
-            } break;
-        case LLM_ARCH_QWEN3VL:
-            {
-                llm = std::make_unique<llm_build_qwen3vl>(*this, params);
-            } break;
-        case LLM_ARCH_QWEN3VLMOE:
-            {
-                llm = std::make_unique<llm_build_qwen3vlmoe>(*this, params);
-            } break;
-        case LLM_ARCH_PHI2:
-            {
-                llm = std::make_unique<llm_build_phi2>(*this, params);
-            } break;
-        case LLM_ARCH_PHI3:
-        case LLM_ARCH_PHIMOE:
-            {
-                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-                    llm = std::make_unique<llm_build_phi3<true>> (*this, params);
-                } else {
-                    llm = std::make_unique<llm_build_phi3<false>>(*this, params);
-                }
-            } break;
-        case LLM_ARCH_PLAMO:
-            {
-                llm = std::make_unique<llm_build_plamo>(*this, params);
-            } break;
-        case LLM_ARCH_PLAMO2:
-            {
-                llm = std::make_unique<llm_build_plamo2>(*this, params);
-            } break;
-        case LLM_ARCH_PLAMO3:
-            {
-                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-                    llm = std::make_unique<llm_build_plamo3<true>> (*this, params);
-                } else {
-                    llm = std::make_unique<llm_build_plamo3<false>>(*this, params);
-                }
-            } break;
-        case LLM_ARCH_GPT2:
-            {
-                llm = std::make_unique<llm_build_gpt2>(*this, params);
-            } break;
-        case LLM_ARCH_CODESHELL:
-            {
-                llm = std::make_unique<llm_build_codeshell>(*this, params);
-            } break;
-        case LLM_ARCH_ORION:
-            {
-                llm = std::make_unique<llm_build_orion>(*this, params);
-            } break;
-        case LLM_ARCH_INTERNLM2:
-            {
-                llm = std::make_unique<llm_build_internlm2>(*this, params);
-            } break;
-        case LLM_ARCH_MINICPM3:
-            {
-                llm = std::make_unique<llm_build_minicpm3>(*this, params);
-            } break;
-        case LLM_ARCH_GEMMA:
-            {
-                llm = std::make_unique<llm_build_gemma>(*this, params);
-            } break;
-        case LLM_ARCH_GEMMA2:
-            {
-                llm = std::make_unique<llm_build_gemma2_iswa>(*this, params);
-            } break;
-        case LLM_ARCH_GEMMA3:
-            {
-                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
-                    llm = std::make_unique<llm_build_gemma3<true>>(*this, params);
-                } else {
-                    llm = std::make_unique<llm_build_gemma3<false>>(*this, params);
-                }
-            } break;
-        case LLM_ARCH_GEMMA3N:
-            {
-                llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
-            } break;
-        case LLM_ARCH_GEMMA4:
-            {
-                llm = std::make_unique<llm_build_gemma4_iswa>(*this, params);
-            } break;
-        case LLM_ARCH_GEMMA_EMBEDDING:
-            {
-                llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
-            } break;
-        case LLM_ARCH_STARCODER2:
-            {
-                llm = std::make_unique<llm_build_starcoder2>(*this, params);
-            } break;
-        case LLM_ARCH_MAMBA:
-        case LLM_ARCH_MAMBA2:
-            {
-                llm = std::make_unique<llm_build_mamba>(*this, params);
-            } break;
-        case LLM_ARCH_JAMBA:
-            {
-                llm = std::make_unique<llm_build_jamba>(*this, params);
-            } break;
-        case LLM_ARCH_XVERSE:
-            {
-                llm = std::make_unique<llm_build_xverse>(*this, params);
-            } break;
-        case LLM_ARCH_COMMAND_R:
-            {
-                llm = std::make_unique<llm_build_command_r>(*this, params);
-            } break;
-        case LLM_ARCH_COHERE2:
-            {
-                llm = std::make_unique<llm_build_cohere2_iswa>(*this, params);
-            } break;
-        case LLM_ARCH_DBRX:
-            {
-                llm = std::make_unique<llm_build_dbrx>(*this, params);
-            } break;
-        case LLM_ARCH_OLMO:
-            {
-                llm = std::make_unique<llm_build_olmo>(*this, params);
-            } break;
-        case LLM_ARCH_OLMO2:
-            {
-                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
-                    llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
-                } else {
-                    llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
-                }
-            } break;
-        case LLM_ARCH_OLMOE:
-            {
-                llm = std::make_unique<llm_build_olmoe>(*this, params);
-            } break;
-        case LLM_ARCH_OPENELM:
-            {
-                llm = std::make_unique<llm_build_openelm>(*this, params);
-            } break;
-        case LLM_ARCH_GPTNEOX:
-            {
-                llm = std::make_unique<llm_build_gptneox>(*this, params);
-            } break;
-        case LLM_ARCH_ARCTIC:
-            {
-                llm = std::make_unique<llm_build_arctic>(*this, params);
-            } break;
-        case LLM_ARCH_DEEPSEEK:
-            {
-                llm = std::make_unique<llm_build_deepseek>(*this, params);
-            } break;
-        case LLM_ARCH_DEEPSEEK2:
-        case LLM_ARCH_DEEPSEEK2OCR:
-        case LLM_ARCH_GLM_DSA:
-        case LLM_ARCH_MISTRAL4:
-            {
-                llm = std::make_unique<llm_build_deepseek2>(*this, params);
-            } break;
-        case LLM_ARCH_CHATGLM:
-            {
-                llm = std::make_unique<llm_build_chatglm>(*this, params);
-            } break;
-        case LLM_ARCH_GLM4:
-            {
-                llm = std::make_unique<llm_build_glm4>(*this, params);
-            } break;
-        case LLM_ARCH_GLM4_MOE:
-            {
-                llm = std::make_unique<llm_build_glm4_moe>(*this, params);
-            } break;
-        case LLM_ARCH_BITNET:
-            {
-                llm = std::make_unique<llm_build_bitnet>(*this, params);
-            } break;
-        case LLM_ARCH_T5:
-            {
-                switch (params.gtype) {
-                    case LLM_GRAPH_TYPE_ENCODER:
-                        llm = std::make_unique<llm_build_t5<true>>(*this, params);
-                        break;
-                    case LLM_GRAPH_TYPE_DEFAULT:
-                    case LLM_GRAPH_TYPE_DECODER:
-                        llm = std::make_unique<llm_build_t5<false>>(*this, params);
-                        break;
-                    default:
-                        GGML_ABORT("invalid graph type");
-                };
-            } break;
-        case LLM_ARCH_T5ENCODER:
-            {
-                llm = std::make_unique<llm_build_t5encoder>(*this, params);
-            } break;
-        case LLM_ARCH_JAIS:
-            {
-                llm = std::make_unique<llm_build_jais>(*this, params);
-            } break;
-        case LLM_ARCH_JAIS2:
-            {
-                llm = std::make_unique<llm_build_jais2>(*this, params);
-            } break;
-        case LLM_ARCH_NEMOTRON:
-            {
-                llm = std::make_unique<llm_build_nemotron>(*this, params);
-            } break;
-        case LLM_ARCH_NEMOTRON_H:
-        case LLM_ARCH_NEMOTRON_H_MOE:
-            {
-                llm = std::make_unique<llm_build_nemotron_h>(*this, params);
-            } break;
-        case LLM_ARCH_EXAONE:
-            {
-                llm = std::make_unique<llm_build_exaone>(*this, params);
-            } break;
-        case LLM_ARCH_EXAONE4:
-            {
-                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
-                    llm = std::make_unique<llm_build_exaone4<true>>(*this, params);
-                } else {
-                    llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
-                }
-            } break;
-        case LLM_ARCH_EXAONE_MOE:
-            {
-                llm = std::make_unique<llm_build_exaone_moe>(*this, params);
-            } break;
-        case LLM_ARCH_RWKV6:
-            {
-                llm = std::make_unique<llm_build_rwkv6>(*this, params);
-            } break;
-        case LLM_ARCH_RWKV6QWEN2:
-            {
-                llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params);
-            } break;
-        case LLM_ARCH_RWKV7:
-            {
-                llm = std::make_unique<llm_build_rwkv7>(*this, params);
-            } break;
-        case LLM_ARCH_ARWKV7:
-            {
-                llm = std::make_unique<llm_build_arwkv7>(*this, params);
-            } break;
-        case LLM_ARCH_GRANITE:
-        case LLM_ARCH_GRANITE_MOE:
-        case LLM_ARCH_MINICPM:
-            {
-                llm = std::make_unique<llm_build_granite>(*this, params);
-            } break;
-        case LLM_ARCH_GRANITE_HYBRID:
-            {
-                llm = std::make_unique<llm_build_granite_hybrid>(*this, params);
-            } break;
-        case LLM_ARCH_CHAMELEON:
-            {
-                llm = std::make_unique<llm_build_chameleon>(*this, params);
-            } break;
+    // choose long/short freq factors based on the context size
+    if (layers[il].rope_freqs != nullptr) {
+        return layers[il].rope_freqs;
+    }
+
+    if (n_ctx_seq > hparams.n_ctx_orig_yarn) {
+        return layers[il].rope_long;
+    }
+
+    return layers[il].rope_short;
+}
+
+llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
+    llama_memory_i * res;
+
+    switch (arch) {
+        // Models that need specific instantiation should be handled in the
+        // switch statement
+        case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_JINA_BERT_V3:
+        case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
+        case LLM_ARCH_NEO_BERT:
+        case LLM_ARCH_EUROBERT:
         case LLM_ARCH_WAVTOKENIZER_DEC:
+        case LLM_ARCH_MODERN_BERT:
+        case LLM_ARCH_GEMMA_EMBEDDING:
+        case LLM_ARCH_DREAM:
+        case LLM_ARCH_LLADA:
+        case LLM_ARCH_LLADA_MOE:
+        case LLM_ARCH_RND1:
             {
-                llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
-            } break;
-        case LLM_ARCH_PLM:
-            {
-                llm = std::make_unique<llm_build_plm>(*this, params);
-            } break;
-        case LLM_ARCH_BAILINGMOE:
-            {
-                llm = std::make_unique<llm_build_bailingmoe>(*this, params);
-            } break;
-        case LLM_ARCH_BAILINGMOE2:
-            {
-                llm = std::make_unique<llm_build_bailingmoe2>(*this, params);
-            } break;
-        case LLM_ARCH_SEED_OSS:
-            {
-                llm = std::make_unique<llm_build_seed_oss>(*this, params);
-            } break;
-        case LLM_ARCH_DOTS1:
-            {
-                llm = std::make_unique<llm_build_dots1>(*this, params);
-            } break;
-        case LLM_ARCH_ARCEE:
-            {
-                llm = std::make_unique<llm_build_arcee>(*this, params);
-            } break;
-        case LLM_ARCH_AFMOE:
-            {
-                llm = std::make_unique<llm_build_afmoe>(*this, params);
-            } break;
-        case LLM_ARCH_ERNIE4_5:
-            {
-                llm = std::make_unique<llm_build_ernie4_5>(*this, params);
-            } break;
-        case LLM_ARCH_ERNIE4_5_MOE:
-            {
-                llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
-            } break;
-        case LLM_ARCH_PADDLEOCR:
-            {
-                llm = std::make_unique<llm_build_paddleocr>(*this, params);
-            } break;
-        case LLM_ARCH_HUNYUAN_MOE:
-            {
-                llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
-            } break;
-        case LLM_ARCH_HUNYUAN_VL:
-        case LLM_ARCH_HUNYUAN_DENSE:
-            {
-                llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
-            } break;
-        case LLM_ARCH_SMOLLM3:
-            {
-                llm = std::make_unique<llm_build_smollm3>(*this, params);
-            } break;
-        case LLM_ARCH_OPENAI_MOE:
-            {
-                llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
-            } break;
-        case LLM_ARCH_FALCON_H1:
-            {
-                llm = std::make_unique<llm_build_falcon_h1>(*this, params);
-            } break;
-        case LLM_ARCH_LFM2:
-        case LLM_ARCH_LFM2MOE:
-            {
-                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
-                    llm = std::make_unique<llm_build_lfm2<true>>(*this, params);
-                } else {
-                    llm = std::make_unique<llm_build_lfm2<false>>(*this, params);
-                }
+                res = nullptr;
             } break;
-        case LLM_ARCH_SMALLTHINKER:
+        // Models that need standard caching should rely on recurrent/hybrid
+        // checks
+        default:
             {
-                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
-                    llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
+                if (llm_arch_is_recurrent(arch)) {
+                    res = new llama_memory_recurrent(
+                            *this,
+                            GGML_TYPE_F32,
+                            GGML_TYPE_F32,
+                            cparams.offload_kqv,
+                            std::max((uint32_t) 1, cparams.n_seq_max),
+                            cparams.n_seq_max,
+                            nullptr);
+                } else if (llm_arch_is_hybrid(arch)) {
+                    // The main difference between hybrid architectures is the
+                    // layer filters, so pick the right one here
+                    llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
+                    llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
+                    if (arch == LLM_ARCH_FALCON_H1) {
+                        filter_attn = [&](int32_t) { return true; };
+                        filter_recr = [&](int32_t) { return true; };
+                    } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
+                        filter_attn = [&](int32_t il) {
+                            return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+                        };
+                        filter_recr = [&](int32_t il) {
+                            return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+                        };
+                    }
+
+                    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                        // Use hybrid-iswa for hybrid models with SWA
+                        res = new llama_memory_hybrid_iswa(
+                            /* model             */ *this,
+                            /* attn_type_k       */ params.type_k,
+                            /* attn_type_v       */ params.type_v,
+                            /* attn_v_trans      */ !cparams.flash_attn,
+                            /* attn_swa_full     */ params.swa_full,
+                            /* attn_kv_size      */ cparams.n_ctx_seq,
+                            /* attn_n_ubatch     */ cparams.n_ubatch,
+                            /* attn_n_pad        */ 1,
+                            /* recurrent_type_r  */ GGML_TYPE_F32,
+                            /* recurrent_type_s  */ GGML_TYPE_F32,
+                            /* recurrent_rs_size */ std::max((uint32_t) 1, cparams.n_seq_max),
+                            /* n_seq_max         */ cparams.n_seq_max,
+                            /* offload           */ cparams.offload_kqv,
+                            /* unified           */ cparams.kv_unified,
+                            /* filter_attn       */ std::move(filter_attn),
+                            /* filter_recr       */ std::move(filter_recr));
+                    } else {
+                        res = new llama_memory_hybrid(
+                            /* model             */ *this,
+                            /* attn_type_k       */ params.type_k,
+                            /* attn_type_v       */ params.type_v,
+                            /* attn_v_trans      */ !cparams.flash_attn,
+                            /* attn_kv_size      */ cparams.n_ctx_seq,
+                            /* attn_n_pad        */ 1,
+                            /* attn_n_swa        */ hparams.n_swa,
+                            /* attn_swa_type     */ hparams.swa_type,
+                            /* recurrent_type_k  */ GGML_TYPE_F32,
+                            /* recurrent_type_v  */ GGML_TYPE_F32,
+                            /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
+                            /* n_seq_max         */ cparams.n_seq_max,
+                            /* offload           */ cparams.offload_kqv,
+                            /* unified           */ cparams.kv_unified,
+                            /* filter_attn       */ std::move(filter_attn),
+                            /* filter_recr       */ std::move(filter_recr));
+                    }
                 } else {
-                    llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
+                    llama_memory_i::layer_reuse_cb reuse = nullptr;
+
+                    if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) {
+                        reuse = [&](int32_t il) {
+                            if (il >= (int32_t) hparams.n_layer_kv_from_start) {
+                                return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
+                            }
+
+                            return -1;
+                        };
+                    }
+
+                    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                        GGML_ASSERT(hparams.is_swa_any());
+
+                        res = new llama_kv_cache_iswa(
+                                *this,
+                                params.type_k,
+                                params.type_v,
+                                !cparams.flash_attn,
+                                cparams.offload_kqv,
+                                params.swa_full,
+                                cparams.kv_unified,
+                                cparams.n_ctx_seq,
+                                cparams.n_seq_max,
+                                cparams.n_ubatch,
+                                1,
+                                nullptr,
+                                reuse);
+                    } else {
+                        GGML_ASSERT(!hparams.is_swa_any());
+
+                        res = new llama_kv_cache(
+                                *this,
+                                params.type_k,
+                                params.type_v,
+                                !cparams.flash_attn,
+                                cparams.offload_kqv,
+                                cparams.kv_unified,
+                                cparams.n_ctx_seq,
+                                cparams.n_seq_max,
+                                1,
+                                hparams.n_swa,
+                                hparams.swa_type,
+                                nullptr,
+                                nullptr);
+                    }
                 }
-            } break;
-        case LLM_ARCH_GROVEMOE:
-            {
-                llm = std::make_unique<llm_build_grovemoe>(*this, params);
-            } break;
-        case LLM_ARCH_APERTUS:
-            {
-                llm = std::make_unique<llm_build_apertus>(*this, params);
-            } break;
-        case LLM_ARCH_MINIMAX_M2:
-            {
-                llm = std::make_unique<llm_build_minimax_m2>(*this, params);
-            } break;
-        case LLM_ARCH_COGVLM:
-            {
-                llm = std::make_unique<llm_build_cogvlm>(*this, params);
-            } break;
-        case LLM_ARCH_PANGU_EMBED:
-            {
-                llm = std::make_unique<llm_build_pangu_embedded>(*this, params);
-            } break;
-        case LLM_ARCH_QWEN3NEXT:
-            {
-                llm = std::make_unique<llm_build_qwen3next>(*this, params);
-            } break;
-        case LLM_ARCH_QWEN35:
-            {
-                llm = std::make_unique<llm_build_qwen35>(*this, params);
-            } break;
-        case LLM_ARCH_QWEN35MOE:
-            {
-                llm = std::make_unique<llm_build_qwen35moe>(*this, params);
-            } break;
-        case LLM_ARCH_MISTRAL3:
-            {
-                llm = std::make_unique<llm_build_mistral3>(*this, params);
-            } break;
-        case LLM_ARCH_MIMO2:
-            {
-                llm = std::make_unique<llm_build_mimo2_iswa>(*this, params);
-            } break;
-        case LLM_ARCH_KIMI_LINEAR:
-            {
-                llm = std::make_unique<llm_build_kimi_linear>(*this, params);
-            } break;
-        case LLM_ARCH_STEP35:
-            {
-                llm = std::make_unique<llm_build_step35_iswa>(*this, params);
-            } break;
-        default:
-            GGML_ABORT("fatal error");
+            }
     }
 
+    return res;
+}
+
+ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
+    std::unique_ptr<llm_graph_context> llm = build_arch_graph(params);
+
     // add on pooling layer
     llm->build_pooling(cls, cls_b, cls_out, cls_out_b, cls_norm);
 
@@ -9487,3 +2475,43 @@ ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int
     }
     return model->devices[i].dev;
 }
+
+//
+// llama_model_base
+//
+
+llama_model_base::llama_model_base(const struct llama_model_params & params) : llama_model(params), model(this), tn(model->arch),
+    TENSOR_DUPLICATED     (llama_model_loader::TENSOR_DUPLICATED),
+    TENSOR_NOT_REQUIRED   (llama_model_loader::TENSOR_NOT_REQUIRED),
+    TENSOR_SKIP           (llama_model_loader::TENSOR_SKIP),
+    TENSOR_SKIP_IF_VIRTUAL(llama_model_loader::TENSOR_SKIP_IF_VIRTUAL) {}
+
+ggml_tensor * llama_model_base::create_tensor(const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) {
+    GGML_ASSERT(ml != nullptr);
+    return create_tensor(*ml, tn, ne, flags);
+}
+
+void llama_model_base::create_tensor_gate_up_exps(llama_layer & layer, int bid, int64_t n_embd_, int64_t n_ff_, int64_t n_expert_, int flags) {
+    layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", bid), {n_embd_, n_ff_ * 2, n_expert_}, TENSOR_NOT_REQUIRED);
+    if (layer.ffn_gate_up_exps == nullptr) {
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", bid), {n_embd_, n_ff_, n_expert_}, flags);
+    }
+}
+
+void llama_model_base::create_tensor_qkv(llama_layer & layer, int bid,
+        int64_t n_embd_, int64_t n_embd_q_, int64_t n_embd_k_, int64_t n_embd_v_,
+        int flags) {
+    const int64_t n_embd_qkv = n_embd_q_ + n_embd_k_ + n_embd_v_;
+    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", bid), {n_embd_, n_embd_qkv}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
+    if (layer.wqkv) {
+        layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", bid), {n_embd_qkv}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
+    } else {
+        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", bid), {n_embd_, n_embd_q_}, flags);
+        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", bid), {n_embd_, n_embd_k_}, flags);
+        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", bid), {n_embd_, n_embd_v_}, flags);
+        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", bid), {n_embd_q_}, TENSOR_NOT_REQUIRED);
+        layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", bid), {n_embd_k_}, TENSOR_NOT_REQUIRED);
+        layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);
+    }
+}
diff --git a/src/llama-model.h b/src/llama-model.h
index 5f101bd63745..d63c689185a9 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -577,14 +577,8 @@ struct llama_model {
     int64_t t_load_us  = 0;
     int64_t t_start_us = 0;
 
-    explicit llama_model(const struct llama_model_params & params);
-    ~llama_model();
-
-    void load_stats  (llama_model_loader & ml);
-    void load_arch   (llama_model_loader & ml);
-    void load_hparams(llama_model_loader & ml);
-    void load_vocab  (llama_model_loader & ml);
-    bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback
+    explicit llama_model(const llama_model_params & params);
+    virtual ~llama_model();
 
     std::string arch_name() const;
     std::string type_name() const;
@@ -620,21 +614,94 @@ struct llama_model {
 
     ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
 
-    // TODO: move this to new llm_arch_model_i interface
     llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const;
 
-    // TODO: move this to new llm_arch_model_i interface
     ggml_cgraph * build_graph(const llm_graph_params & params) const;
 
-private:
+    virtual void load_stats  (llama_model_loader & ml) = 0;
+    virtual void load_hparams(llama_model_loader & ml) = 0;
+    virtual void load_vocab  (llama_model_loader & ml) = 0;
+    virtual bool load_tensors(llama_model_loader & ml) = 0; // returns false if cancelled by progress_callback
+
+    // model must define these
+    virtual void load_arch_hparams(llama_model_loader & ml) = 0;
+    virtual void load_arch_tensors(llama_model_loader & ml) = 0;
+    virtual std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const = 0;
+
+protected:
     llama_model_params params;
 
     struct impl;
     std::unique_ptr<impl> pimpl;
 };
 
+llama_model * llama_model_create(llm_arch arch, const llama_model_params & params);
+llama_model * llama_model_create(llama_model_loader & ml, const llama_model_params & params);
+
+// model must inherit from this
+struct llama_model_base : public llama_model {
+    friend struct llama_model;
+
+    llama_model * model;
+    llama_model_loader * ml = nullptr;
+    const LLM_TN tn;
+
+    // llama_model_loader is not yet defined at this point, so we will set it after construction
+    const int TENSOR_DUPLICATED;
+    const int TENSOR_NOT_REQUIRED;
+    const int TENSOR_SKIP;
+    const int TENSOR_SKIP_IF_VIRTUAL;
+
+    explicit llama_model_base(const llama_model_params & params);
+    virtual ~llama_model_base() = default;
+
+    ggml_tensor * create_tensor(llama_model_loader & ml, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags);
+
+    // convenience overload of create_tensor that doesn't require llama_model_loader
+    ggml_tensor * create_tensor(const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags);
+
+    // helper: try merged gate_up_exps first, fall back to separate gate and up
+    void create_tensor_gate_up_exps(llama_layer & layer, int bid, int64_t n_embd_,
+                int64_t n_ff_, int64_t n_expert_, int flags);
+
+    // helper: try to load merged qkv first, fall back to separate q, k, v
+    void create_tensor_qkv(llama_layer & layer, int bid,
+                int64_t n_embd_, int64_t n_embd_q_, int64_t n_embd_k_, int64_t n_embd_v_,
+                int flags);
+
+    void load_stats  (llama_model_loader & ml) override;
+    void load_hparams(llama_model_loader & ml) override;
+    void load_vocab  (llama_model_loader & ml) override;
+    bool load_tensors(llama_model_loader & ml) override;
+
+    // model must define these
+    void load_arch_hparams(llama_model_loader & ml) override = 0;
+    void load_arch_tensors(llama_model_loader & ml) override = 0;
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override = 0;
+};
+
 const char * llm_type_name(llm_type type);
 
+// convenience macro for loading local variables for load_tensors() in llama_model_base
+// note: cast to int64_t since we will use these for the tensor dimensions
+#define LLAMA_LOAD_LOCALS \
+    const int     n_layer        = hparams.n_layer;          GGML_UNUSED(n_layer); \
+    const int64_t n_head         = hparams.n_head();         GGML_UNUSED(n_head); \
+    const int64_t n_head_kv      = hparams.n_head_kv();      GGML_UNUSED(n_head_kv); \
+    const int64_t n_embd         = hparams.n_embd;           GGML_UNUSED(n_embd); \
+    const int64_t n_embd_k_gqa   = hparams.n_embd_k_gqa();   GGML_UNUSED(n_embd_k_gqa); \
+    const int64_t n_embd_v_gqa   = hparams.n_embd_v_gqa();   GGML_UNUSED(n_embd_v_gqa); \
+    const int64_t n_embd_head_k  = hparams.n_embd_head_k();  GGML_UNUSED(n_embd_head_k); \
+    const int64_t n_embd_head_v  = hparams.n_embd_head_v();  GGML_UNUSED(n_embd_head_v); \
+    const int64_t n_ff           = hparams.n_ff();           GGML_UNUSED(n_ff); \
+    const int64_t n_embd_gqa     = n_embd_v_gqa;             GGML_UNUSED(n_embd_gqa); \
+    const int64_t n_vocab        = vocab.n_tokens();         GGML_UNUSED(n_vocab); \
+    const int64_t n_token_types  = vocab.n_token_types();    GGML_UNUSED(n_token_types); \
+    const int64_t n_rot          = hparams.n_rot();          GGML_UNUSED(n_rot); \
+    const int64_t n_expert       = hparams.n_expert;         GGML_UNUSED(n_expert); \
+    const int64_t n_expert_used  = hparams.n_expert_used;    GGML_UNUSED(n_expert_used); \
+    const int64_t n_ctx_train    = hparams.n_ctx_train;      GGML_UNUSED(n_ctx_train);
+
 // For internal test use
 // TODO: remove
 const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 2f0f70b73b6c..43e05c3d56fe 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -882,13 +882,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
 
-    llama_model model(llama_model_default_params());
+    auto mparams = llama_model_default_params();
+    std::unique_ptr<llama_model> model_ptr(llama_model_create(ml, mparams));
 
-    model.load_arch   (ml);
-    model.load_hparams(ml);
-    model.load_stats  (ml);
+    auto * model = dynamic_cast<llama_model_base *>(model_ptr.get());
+    if (model == nullptr) {
+        GGML_ABORT("fatal error: model does not implement llama_model_base");
+    }
+
+    model->load_hparams(ml);
+    model->load_stats  (ml);
 
-    quantize_state_impl qs(model, params);
+    quantize_state_impl qs(*model, params);
 
     if (params->only_copy) {
         ftype = ml.ftype;
@@ -1023,7 +1028,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         }
         gguf_add_tensor(ctx_outs[i_split].get(), tensor);
 
-        metadata[i].allows_quantization = tensor_allows_quantization(params, model.arch, tensor);
+        metadata[i].allows_quantization = tensor_allows_quantization(params, model->arch, tensor);
 
         if (metadata[i].allows_quantization) {
             metadata[i].target_type = llama_tensor_get_type(qs, params, tensor, default_type, metadata[i]);
@@ -1331,9 +1336,9 @@ void llama_quant_free(quantize_state_impl * qs) {
 
 llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * desc) {
     struct llama_model_params mparams = llama_model_default_params();
-    auto * model = new llama_model(mparams);
-
-    model->arch = llm_arch_from_string(desc->architecture);
+    auto arch = llm_arch_from_string(desc->architecture);
+    auto * model = llama_model_create(arch, mparams);
+    model->arch = arch;
 
     // infer llm_type: only LLM_TYPE_70B matters for quantization logic
     if (model->arch == LLM_ARCH_LLAMA && desc->n_layer == 80 && desc->n_head != desc->n_head_kv) {
diff --git a/src/llama.cpp b/src/llama.cpp
index e9c3028585d4..97529d8b75df 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -111,113 +111,8 @@ int64_t llama_time_us(void) {
     return ggml_time_us();
 }
 
-// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
-static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
-        const std::string & fname, std::vector<std::string> & splits, FILE * file, llama_model & model, llama_model_params & params) {
-    // loading time will be recalculated after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = 0;
-    time_meas tm(model.t_load_us);
-
-    model.t_start_us = tm.t_start_us;
-
-    try {
-        llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io,
-            params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
-
-        ml.print_info();
-
-        model.hparams.vocab_only = params.vocab_only;
-        model.hparams.no_alloc   = params.no_alloc;
-
-        try {
-            model.load_arch(ml);
-        } catch(const std::exception & e) {
-            throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
-        }
-        try {
-            model.load_hparams(ml);
-        } catch(const std::exception & e) {
-            throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
-        }
-        if (model.arch == LLM_ARCH_CLIP) {
-            throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
-        }
-        try {
-            model.load_vocab(ml);
-        } catch(const std::exception & e) {
-            throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
-        }
-
-        model.load_stats(ml);
-        model.print_info();
-
-        if (params.vocab_only) {
-            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
-            return 0;
-        }
-
-        if (!model.load_tensors(ml)) {
-            return -2;
-        }
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
-        return -1;
-    }
-
-    return 0;
-}
-
-static struct llama_model * llama_model_load_from_file_impl(
-        struct gguf_context * metadata,
-        llama_model_set_tensor_data_t set_tensor_data,
-        void * set_tensor_data_ud,
-        const std::string & path_model,
-        std::vector<std::string> & splits,
-        FILE * file,
-        struct llama_model_params params) {
-    {
-        int n_sources_defined = 0;
-        if (metadata != nullptr) {
-            n_sources_defined++;
-        }
-        if (!path_model.empty()) {
-            n_sources_defined++;
-        }
-        if (file != nullptr) {
-            n_sources_defined++;
-        }
-        if (n_sources_defined != 1) {
-            LLAMA_LOG_ERROR("%s: exactly one out metadata, path_model, and file must be defined\n", __func__);
-            return nullptr;
-        }
-    }
-    ggml_time_init();
-
-    if (!params.vocab_only && ggml_backend_reg_count() == 0) {
-        LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
-        return nullptr;
-    }
-
-    unsigned cur_percentage = 0;
-    if (params.progress_callback == NULL) {
-        params.progress_callback_user_data = &cur_percentage;
-        params.progress_callback = [](float progress, void * ctx) {
-            unsigned * cur_percentage_p = (unsigned *) ctx;
-            unsigned percentage = (unsigned) (100 * progress);
-            while (percentage > *cur_percentage_p) {
-                *cur_percentage_p = percentage;
-                LLAMA_LOG_CONT(".");
-                if (percentage >= 100) {
-                    LLAMA_LOG_CONT("\n");
-                }
-            }
-            return true;
-        };
-    }
-
-    llama_model * model = new llama_model(params);
-
+// returns true on success
+static bool llama_prepare_model_devices(const llama_model_params & params, llama_model * model) {
     // create list of devices to use with this model
     if (params.devices) {
         if (params.split_mode == LLAMA_SPLIT_MODE_TENSOR) {
@@ -227,7 +122,7 @@ static struct llama_model * llama_model_load_from_file_impl(
             }
             if (n_devs == 0) {
                 LLAMA_LOG_ERROR("%s: LLAMA_SPLIT_MODE_TENSOR needs >= 1 devices\n", __func__);
-                return nullptr;
+                return false;
             }
             LLAMA_LOG_INFO("%s: creating a Meta device with %zu devices\n", __func__, n_devs);
             for (size_t i = 0; i < n_devs; ++i) {
@@ -265,7 +160,7 @@ static struct llama_model * llama_model_load_from_file_impl(
             }
             if (devs.empty()) {
                 LLAMA_LOG_ERROR("%s: LLAMA_SPLIT_MODE_TENSOR needs >= 1 devices\n", __func__);
-                return nullptr;
+                return false;
             }
 
             LLAMA_LOG_INFO("%s: creating a Meta device for tensor parallelism from %zu devices:\n", __func__, devs.size());
@@ -347,8 +242,7 @@ static struct llama_model * llama_model_load_from_file_impl(
         } else {
             if (params.main_gpu >= (int)model->devices.size()) {
                 LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
-                llama_model_free(model);
-                return nullptr;
+                return false;
             }
             llama_device main_gpu = model->devices[params.main_gpu];
             model->devices.clear();
@@ -365,7 +259,121 @@ static struct llama_model * llama_model_load_from_file_impl(
                 props.memory_free/1024/1024);
     }
 
-    const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, file, *model, params);
+    return true;
+}
+
+// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
+static std::pair<int, llama_model *> llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
+        const std::string & fname, std::vector<std::string> & splits, FILE * file, llama_model_params & params) {
+    try {
+        llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io,
+            params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
+
+        ml.print_info();
+        std::unique_ptr<llama_model> model_ptr(llama_model_create(ml, params));
+
+        bool ok = llama_prepare_model_devices(params, model_ptr.get());
+        if (!ok) {
+            return {-1, nullptr};
+        }
+
+        auto * model = dynamic_cast<llama_model_base *>(model_ptr.get());
+        if (model == nullptr) {
+            GGML_ABORT("fatal error: model does not implement llama_model_base");
+        }
+
+        // loading time will be recalculated after the first eval, so
+        // we take page faults deferred by mmap() into consideration
+        model->t_load_us = 0;
+        time_meas tm(model->t_load_us);
+
+        model->t_start_us = tm.t_start_us;
+
+        model->hparams.vocab_only = params.vocab_only;
+        model->hparams.no_alloc   = params.no_alloc;
+
+        try {
+            model->load_hparams(ml);
+        } catch(const std::exception & e) {
+            throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
+        }
+        if (model->arch == LLM_ARCH_CLIP) {
+            throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
+        }
+        try {
+            model->load_vocab(ml);
+        } catch(const std::exception & e) {
+            throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
+        }
+
+        model->load_stats(ml);
+        model->print_info();
+
+        if (params.vocab_only) {
+            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
+            return {0, model_ptr.release()};
+        }
+
+        if (!model->load_tensors(ml)) {
+            return {-2, nullptr};
+        }
+
+        return {0, model_ptr.release()};
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
+        return {-1, nullptr};
+    }
+}
+
+static struct llama_model * llama_model_load_from_file_impl(
+        struct gguf_context * metadata,
+        llama_model_set_tensor_data_t set_tensor_data,
+        void * set_tensor_data_ud,
+        const std::string & path_model,
+        std::vector<std::string> & splits,
+        FILE * file,
+        struct llama_model_params params) {
+    {
+        int n_sources_defined = 0;
+        if (metadata != nullptr) {
+            n_sources_defined++;
+        }
+        if (!path_model.empty()) {
+            n_sources_defined++;
+        }
+        if (file != nullptr) {
+            n_sources_defined++;
+        }
+        if (n_sources_defined != 1) {
+            LLAMA_LOG_ERROR("%s: exactly one out metadata, path_model, and file must be defined\n", __func__);
+            return nullptr;
+        }
+    }
+    ggml_time_init();
+
+    if (!params.vocab_only && ggml_backend_reg_count() == 0) {
+        LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
+        return nullptr;
+    }
+
+    unsigned cur_percentage = 0;
+    if (params.progress_callback == NULL) {
+        params.progress_callback_user_data = &cur_percentage;
+        params.progress_callback = [](float progress, void * ctx) {
+            unsigned * cur_percentage_p = (unsigned *) ctx;
+            unsigned percentage = (unsigned) (100 * progress);
+            while (percentage > *cur_percentage_p) {
+                *cur_percentage_p = percentage;
+                LLAMA_LOG_CONT(".");
+                if (percentage >= 100) {
+                    LLAMA_LOG_CONT("\n");
+                }
+            }
+            return true;
+        };
+    }
+
+    const auto [status, model] = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, file, params);
     GGML_ASSERT(status <= 0);
     if (status < 0) {
         if (status == -1) {
@@ -374,7 +382,9 @@ static struct llama_model * llama_model_load_from_file_impl(
             LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
         }
 
-        llama_model_free(model);
+        if (model) {
+            llama_model_free(model);
+        }
         return nullptr;
     }
 
diff --git a/src/models/afmoe.cpp b/src/models/afmoe.cpp
index 2790b12111da..602e3176afd0 100644
--- a/src/models/afmoe.cpp
+++ b/src/models/afmoe.cpp
@@ -1,6 +1,112 @@
 #include "models.h"
 
-llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_afmoe::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
+
+    // Set up interleaved sliding window attention (ISWA)
+    // Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
+    if (hparams.n_swa > 0) {
+        hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+        uint32_t swa_period = 4;
+        ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+        hparams.set_swa_pattern(swa_period);
+
+        hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+    } else {
+        hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+    }
+
+    // Default to sigmoid if not set
+    if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+        hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+    }
+
+    switch (hparams.n_layer) {
+        case 56: type = LLM_TYPE_6B; break;
+        case 32: type = LLM_TYPE_26B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_afmoe::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    const int64_t n_ff_exp = hparams.n_ff_exp;
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        // dual attention normalization
+        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), {n_embd}, 0);
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+        // attention projections
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        // Q/K normalization
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+
+        // attention gating
+        layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+
+        // dual ffn normalization
+        layer.ffn_norm      = create_tensor(tn(LLM_TENSOR_FFN_NORM,      "weight", i), {n_embd}, 0);
+        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+
+        if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) {
+            // MoE layers
+            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
+
+            // grouped expert weights
+            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+            // shared expert
+            if (n_expert_shared > 0) {
+                const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
+                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, 0);
+            }
+        } else {
+            // Dense layers
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_afmoe::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
 
diff --git a/src/models/apertus.cpp b/src/models/apertus.cpp
index af44cea60541..136ff7029571 100644
--- a/src/models/apertus.cpp
+++ b/src/models/apertus.cpp
@@ -1,6 +1,62 @@
 #include "models.h"
 
-llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_apertus::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N,        hparams.xielu_alpha_n, hparams.n_layer);
+    ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P,        hparams.xielu_alpha_p, hparams.n_layer);
+    ml.get_key_or_arr(LLM_KV_XIELU_BETA,           hparams.xielu_beta,    hparams.n_layer);
+    ml.get_key_or_arr(LLM_KV_XIELU_EPS,            hparams.xielu_eps,     hparams.n_layer);
+
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_8B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_apertus::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        } else {
+            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        }
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+        // optional bias tensors
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+
+        // Q and K layernorms for Apertus
+        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
+        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_apertus::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_apertus::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/arcee.cpp b/src/models/arcee.cpp
index 2e71f5d9e2a3..70e86d41130f 100644
--- a/src/models/arcee.cpp
+++ b/src/models/arcee.cpp
@@ -1,6 +1,51 @@
 #include "models.h"
 
-llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_arcee::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    // Arcee uses the same structure as Llama
+    switch (hparams.n_layer) {
+        case 36: type = LLM_TYPE_4B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_arcee::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_arcee::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_arcee::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/arctic.cpp b/src/models/arctic.cpp
index f8ca6aff6abb..d8653a44639d 100644
--- a/src/models/arctic.cpp
+++ b/src/models/arctic.cpp
@@ -1,6 +1,59 @@
 #include "models.h"
 
-llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_arctic::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    if (hparams.n_expert == 128) {
+        switch (hparams.n_layer) {
+            case 35: type = LLM_TYPE_10B_128x3_66B; break;
+            default: type = LLM_TYPE_UNKNOWN;
+        }
+    } else {
+        type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_arctic::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+        layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, false);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_arctic::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_arctic::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/arwkv7.cpp b/src/models/arwkv7.cpp
index 107a3bef8daf..79aa8c90899f 100644
--- a/src/models/arwkv7.cpp
+++ b/src/models/arwkv7.cpp
@@ -1,7 +1,123 @@
 #include "models.h"
 
+void llama_model_arwkv7::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,                hparams.f_norm_eps, false);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,            hparams.f_norm_rms_eps, false);
+    ml.get_key(LLM_KV_WKV_HEAD_SIZE,                          hparams.wkv_head_size);
+    ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK,              hparams.n_lora_decay);
+    ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK,               hparams.n_lora_iclr);
+    ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
+    ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK,               hparams.n_lora_gate, false);
+    ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,                      hparams.token_shift_count, false);
+
+    switch (hparams.n_layer) {
+        case 12:
+            switch (hparams.n_embd) {
+                case 768: type = LLM_TYPE_190M; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 24:
+            switch (hparams.n_embd) {
+                case 1024: type = LLM_TYPE_450M; break;
+                case 2048: type = LLM_TYPE_1_5B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 28:
+            switch (hparams.n_embd) {
+                case 1536: type = LLM_TYPE_1_5B; break;
+                case 3584: type = LLM_TYPE_7B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 32:
+            switch (hparams.n_embd) {
+                case 2560: type = LLM_TYPE_2_9B; break;
+                case 4096: type = LLM_TYPE_7B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 61:
+            switch (hparams.n_embd) {
+                case 4096: type = LLM_TYPE_14B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_arwkv7::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+    const int n_lora_decay = hparams.n_lora_decay;
+    const int n_lora_iclr = hparams.n_lora_iclr;
+    const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
+    const int n_lora_gate = hparams.n_lora_gate;
+    const int attn_hidden_size = n_embd;
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
+        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
+        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
+
+        layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
+        layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
+        layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
+
+        if (i == 0) {
+            // actually not used
+            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
+            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
+            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
+        } else {
+            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
+            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
+            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
+        }
+
+        layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
+        layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);
+
+        try {
+            layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
+        } catch(std::runtime_error & e) {
+            // ARWKV models may not have gate tensors
+            layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
+        }
+
+        layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
+        layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
+        layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
+
+        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
+        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
+        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
+
+        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+
+}
+
+std::unique_ptr<llm_graph_context> llama_model_arwkv7::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
 
-llm_build_arwkv7::llm_build_arwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
+llama_model_arwkv7::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
     GGML_ASSERT(n_embd == hparams.n_embd_r());
 
     ggml_tensor * cur;
diff --git a/src/models/baichuan.cpp b/src/models/baichuan.cpp
index 2d0d05df4850..4e55290e4e5e 100644
--- a/src/models/baichuan.cpp
+++ b/src/models/baichuan.cpp
@@ -1,6 +1,49 @@
 #include "models.h"
 
-llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_baichuan::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_7B; break;
+        case 40: type = LLM_TYPE_13B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+
+    if (type == LLM_TYPE_13B) {
+        // TODO: become GGUF KV parameter
+        hparams.f_max_alibi_bias = 8.0f;
+    }
+}
+
+void llama_model_baichuan::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+    {
+        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+        output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_baichuan::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_baichuan::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/bailingmoe.cpp b/src/models/bailingmoe.cpp
index 67a7120d6228..030dd4f42a45 100644
--- a/src/models/bailingmoe.cpp
+++ b/src/models/bailingmoe.cpp
@@ -1,6 +1,65 @@
 #include "models.h"
 
-llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_bailingmoe::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
+
+    switch (hparams.n_layer) {
+        case 28: type = LLM_TYPE_16B; break;
+        case 88: type = LLM_TYPE_290B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_bailingmoe::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+    const int64_t n_ff_exp            = hparams.n_ff_exp;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_head * n_rot, n_head_kv * n_rot, n_head_kv * n_rot, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+        if (n_expert == 0) {
+            throw std::runtime_error("n_expert must be > 0");
+        }
+        if (n_expert_used == 0) {
+            throw std::runtime_error("n_expert_used must be > 0");
+        }
+
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+
+        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
+        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_bailingmoe::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_bailingmoe::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
diff --git a/src/models/bailingmoe2.cpp b/src/models/bailingmoe2.cpp
index 497b4babd0cb..e7fe3d5b45ae 100644
--- a/src/models/bailingmoe2.cpp
+++ b/src/models/bailingmoe2.cpp
@@ -1,6 +1,100 @@
 #include "models.h"
 
-llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
+void llama_model_bailingmoe2::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead, false);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.nextn_predict_layers, false);
+    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+
+    // TODO: when MTP is implemented, this should probably be updated if needed
+    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
+    switch (hparams.n_layer) {
+        case 20: type = LLM_TYPE_16B_A1B; break;
+        case 21: type = LLM_TYPE_16B_A1B; break;
+        case 32: type = LLM_TYPE_100B_A6B; break;
+        case 33: type = LLM_TYPE_100B_A6B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_bailingmoe2::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+    const int64_t n_ff_exp        = hparams.n_ff_exp;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
+    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");
+
+    for (int i = 0; i < n_layer; ++i) {
+        int flags = 0;
+        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+            // skip all tensors in the NextN layers
+            flags |= TENSOR_SKIP;
+        }
+
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
+
+        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
+
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+
+        if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
+            const int64_t n_ff_shexp = (hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp) * n_expert_shared;
+
+            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
+            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
+
+            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, flags);
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
+
+            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
+            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
+            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, flags);
+        } else { // Dense layers
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, flags);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, flags);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, flags);
+        }
+
+        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
+            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
+            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED | flags);
+            layer.layer_out_norm         = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, flags);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_bailingmoe2::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
diff --git a/src/models/bert.cpp b/src/models/bert.cpp
index 7e046cfd2a49..3c28f419ccf2 100644
--- a/src/models/bert.cpp
+++ b/src/models/bert.cpp
@@ -1,6 +1,83 @@
 #include "models.h"
 
-llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_bert::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
+
+    switch (hparams.n_layer) {
+        case 3:
+            type = LLM_TYPE_17M; break; // bge-micro
+        case 6:
+            type = LLM_TYPE_22M; break; // MiniLM-L6
+        case 12:
+            switch (hparams.n_embd) {
+                case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
+                case 768: type = LLM_TYPE_109M; break; // bge-base
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 24:
+            type = LLM_TYPE_335M; break; // bge-large
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_bert::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    if (n_token_types == 0) {
+        throw std::runtime_error(arch_name() + " model needs to define token type count");
+    }
+    tok_embd     = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
+    type_embd    = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
+
+    if (arch == LLM_ARCH_BERT) {
+        pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,    "weight"), {n_embd, n_ctx_train}, 0);
+
+        cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+        cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);
+
+        cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+        cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
+    }
+
+    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0);
+    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {n_embd}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd}, 0);
+
+        if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff,   n_expert}, 0);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff,   n_embd, n_expert}, 0);
+            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,   "weight", i), {n_embd, n_expert}, 0);
+        } else {
+            layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
+
+            if (arch == LLM_ARCH_NOMIC_BERT) {
+                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+            }
+        }
+
+        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i),   {n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_bert::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_bert::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/bitnet.cpp b/src/models/bitnet.cpp
index 71526354ca6a..7e8125deec40 100644
--- a/src/models/bitnet.cpp
+++ b/src/models/bitnet.cpp
@@ -1,7 +1,54 @@
 #include "models.h"
 
+void llama_model_bitnet::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    switch (hparams.n_layer) {
+        case 26: type = LLM_TYPE_3B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_bitnet::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm     = create_tensor(tn(LLM_TENSOR_ATTN_NORM,     "weight", i), {n_embd}, 0);
+        layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
+
+        layer.wq       = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+        layer.wq_s     = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
+        layer.wk       = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+        layer.wk_s     = create_tensor(tn(LLM_TENSOR_ATTN_K,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
+        layer.wv       = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+        layer.wv_s     = create_tensor(tn(LLM_TENSOR_ATTN_V,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
+        layer.wo       = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        layer.wo_s     = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_norm     = create_tensor(tn(LLM_TENSOR_FFN_NORM,     "weight", i), {n_embd}, 0);
+        layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
+
+        layer.ffn_gate       = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+        layer.ffn_gate_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
+        layer.ffn_down       = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+        layer.ffn_down_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
+        layer.ffn_up         = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+        layer.ffn_up_s   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_bitnet::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_bitnet::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/bloom.cpp b/src/models/bloom.cpp
index f3b0999bf541..b600fb0c9547 100644
--- a/src/models/bloom.cpp
+++ b/src/models/bloom.cpp
@@ -1,6 +1,68 @@
 #include "models.h"
 
-llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_bloom::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+    switch (hparams.n_layer) {
+        case 24: type = LLM_TYPE_1B; break;
+        case 30:
+            switch (hparams.n_embd) {
+                case 2560: type = LLM_TYPE_3B; break;
+                case 4096: type = LLM_TYPE_7B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+
+    // TODO: become GGUF KV parameter
+    hparams.f_max_alibi_bias = 8.0f;
+}
+
+void llama_model_bloom::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);
+    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0);
+    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {n_embd}, 0);
+
+    // output
+    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), {n_embd}, 0);
+
+        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+        layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+
+        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd}, 0);
+
+        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd}, 0);
+
+        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
+
+        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias",   i), {n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_bloom::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_bloom::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/chameleon.cpp b/src/models/chameleon.cpp
index 21deaba1a6df..8510b9e29f84 100644
--- a/src/models/chameleon.cpp
+++ b/src/models/chameleon.cpp
@@ -1,8 +1,56 @@
 #include "models.h"
-
 #include <float.h>
 
-llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_chameleon::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    hparams.f_norm_eps = 1e-5;  // eps for qk-norm, torch default
+    ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm, false);
+
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_7B; break;
+        case 48: type = LLM_TYPE_34B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+   }
+}
+
+void llama_model_chameleon::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
+        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i),  {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
+        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i),  {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_chameleon::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_chameleon::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/chatglm.cpp b/src/models/chatglm.cpp
index 7d4a43fdca54..e898eff79392 100644
--- a/src/models/chatglm.cpp
+++ b/src/models/chatglm.cpp
@@ -1,7 +1,60 @@
 #include "models.h"
 
+void llama_model_chatglm::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    switch (hparams.n_layer) {
+        case 28: {
+            if (hparams.n_head(0) == 16) {
+                type = LLM_TYPE_1_5B;
+            } else {
+                type = LLM_TYPE_6B;
+            }
+        } break;
+        case 40: {
+            if (hparams.n_head(0) == 24) {
+                type = LLM_TYPE_4B;
+            } else {
+                type = LLM_TYPE_9B;
+            }
+        } break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_chatglm::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+
+        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);
+
+        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_chatglm::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
 
-llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+llama_model_chatglm::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/codeshell.cpp b/src/models/codeshell.cpp
index 3ceb5835b854..e9e85d967139 100644
--- a/src/models/codeshell.cpp
+++ b/src/models/codeshell.cpp
@@ -1,6 +1,55 @@
 #include "models.h"
 
-llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_codeshell::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+    switch (hparams.n_layer) {
+        case 42: type = LLM_TYPE_7B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_codeshell::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if tok embd is NULL, init from output
+    if (tok_embd == NULL) {
+        tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    // output
+    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+
+        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
+
+        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
+
+        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
+
+        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff}, 0);
+        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_codeshell::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_codeshell::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/cogvlm.cpp b/src/models/cogvlm.cpp
index be3eeeddac7f..79236121bd55 100644
--- a/src/models/cogvlm.cpp
+++ b/src/models/cogvlm.cpp
@@ -1,6 +1,55 @@
 #include "models.h"
 
-llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
+void llama_model_cogvlm::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_13B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_cogvlm::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.visexp_attn_wqkv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0);
+        layer.visexp_attn_wo = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+        layer.visexp_ffn_gate = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.visexp_ffn_down = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.visexp_ffn_up   = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_cogvlm::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_cogvlm::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
     const float   kq_scale    = 1.0f / sqrtf(float(n_embd_head));
diff --git a/src/models/cohere2-iswa.cpp b/src/models/cohere2.cpp
similarity index 60%
rename from src/models/cohere2-iswa.cpp
rename to src/models/cohere2.cpp
index 670b08e7d97c..12edbae10941 100644
--- a/src/models/cohere2-iswa.cpp
+++ b/src/models/cohere2.cpp
@@ -1,6 +1,53 @@
 #include "models.h"
 
-llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_cohere2::load_arch_hparams(llama_model_loader & ml) {
+    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+    uint32_t swa_period = 4;
+    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+    hparams.set_swa_pattern(swa_period);
+    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
+    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,       hparams.rope_freq_base_train_swa, false);
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+    ml.get_key(LLM_KV_LOGIT_SCALE,              hparams.f_logit_scale);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_8B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_cohere2::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+    // init output from the input tok embed
+    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
+                                      TENSOR_DUPLICATED);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_cohere2::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_cohere2::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/command-r.cpp b/src/models/command-r.cpp
index 067961caa086..decb89f547b0 100644
--- a/src/models/command-r.cpp
+++ b/src/models/command-r.cpp
@@ -1,8 +1,48 @@
 #include "models.h"
 
+void llama_model_command_r::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_LOGIT_SCALE,             hparams.f_logit_scale, false);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+    switch (hparams.n_layer) {
+        case 40: type = LLM_TYPE_35B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_command_r::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    // init output from the input tok embed
+    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
+        if (n_layer >= 64){
+            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
+            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
+        }
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_command_r::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
 
-llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_graph_params & params) :
+llama_model_command_r::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
diff --git a/src/models/dbrx.cpp b/src/models/dbrx.cpp
index 0e8827218076..bce6b04bcf9b 100644
--- a/src/models/dbrx.cpp
+++ b/src/models/dbrx.cpp
@@ -1,6 +1,50 @@
 #include "models.h"
 
-llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_dbrx::load_arch_hparams(llama_model_loader & ml) {
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv);
+
+switch (hparams.n_layer) {
+    case 40: type = LLM_TYPE_16x12B; break;
+    default: type = LLM_TYPE_UNKNOWN;
+}
+        }
+
+void llama_model_dbrx::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    if (n_expert == 0) {
+        throw std::runtime_error("DBRX model cannot have zero experts");
+    }
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_dbrx::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_dbrx::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/deci.cpp b/src/models/deci.cpp
index 30272eabd69e..9f1a959c32c3 100644
--- a/src/models/deci.cpp
+++ b/src/models/deci.cpp
@@ -1,6 +1,82 @@
 #include "models.h"
 
-llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_deci::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_7B; break;
+        case 80: type = LLM_TYPE_70B; break;
+        case 162: type = LLM_TYPE_405B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_deci::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+        const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa(i);
+        const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa(i);
+        const int64_t n_ff          = hparams.n_ff(i);
+        const int64_t n_head        = hparams.n_head(i);
+        const int64_t n_head_kv     = hparams.n_head_kv(i);
+
+        if (n_head_kv == 0 && n_head > 0) {
+            // linear attention for DeciLMCausalModel
+            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        }
+        else if (n_head_kv > 0) {
+            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+            create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+        }
+
+        // optional bias tensors
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+        if (n_ff > 0) {
+            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        }
+
+        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        }
+        else {
+            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        }
+
+        if (n_ff > 0) {
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+        }
+
+        // optional MLP bias
+        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_deci::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_deci::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/deepseek.cpp b/src/models/deepseek.cpp
index 671b72dfead7..c79460596623 100644
--- a/src/models/deepseek.cpp
+++ b/src/models/deepseek.cpp
@@ -1,6 +1,77 @@
 #include "models.h"
 
-llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_graph_params & params) :
+void llama_model_deepseek::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
+
+    switch (hparams.n_ff_exp) {
+        case 1408: type = LLM_TYPE_16B; break;
+        case 1792: type = LLM_TYPE_20B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_deepseek::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+
+    const int64_t n_ff_exp        = hparams.n_ff_exp;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    // try to load output.weight, if not found, use token_embd (tied embeddings)
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    if (!output) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        if (i < (int) hparams.n_layer_dense_lead) {
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+        } else {
+            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+            if (n_expert == 0) {
+                throw std::runtime_error("n_expert must be > 0");
+            }
+            if (n_expert_used == 0) {
+                throw std::runtime_error("n_expert_used must be > 0");
+            }
+
+            // MoE branch
+            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+
+            // Shared expert branch
+            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
+            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_deepseek::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_deepseek::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp
index 303fc72c6105..53574df4663a 100644
--- a/src/models/deepseek2.cpp
+++ b/src/models/deepseek2.cpp
@@ -1,6 +1,148 @@
 #include "models.h"
 
-llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
+void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) {
+    const auto n_vocab = vocab.n_tokens();
+
+    // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B
+    const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));
+
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
+    if (!is_lite) {
+        ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+    }
+    ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,     hparams.n_lora_kv);
+    ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,   hparams.n_embd_head_k_mla_impl, false);
+    ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,        hparams.n_expert_shared);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,       hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,        hparams.expert_weights_norm, false);
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,         hparams.expert_gating_func, false);
+    if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+        // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
+        // that have no expert_gating_func model parameter set
+        if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) {
+            // GLM 4.7 Lite
+            hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+        } else {
+            hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
+        }
+    }
+
+    if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false)) {
+        // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+        // cancel the factor from the convert script
+        hparams.rope_yarn_log_mul /= 0.1f;
+    }
+
+    // (optional) temperature tuning - used by mistral-large
+    ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE,  hparams.f_attn_temp_scale,       false);
+    ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false); // FIXME why not use temperature_length?
+
+    hparams.f_attn_temp_offset = 0.0f;
+
+    switch (hparams.n_layer) {
+        case 27: type = LLM_TYPE_16B; break;
+        case 47: type = LLM_TYPE_30B_A3B; break;
+        case 60: type = LLM_TYPE_236B; break;
+        case 61: type = LLM_TYPE_671B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_deepseek2::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+    const bool is_mla = hparams.is_mla();
+
+    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
+    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
+
+    const int64_t n_embd_head_qk_rope = hparams.n_rot();
+    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
+    GGML_ASSERT(n_embd_head_qk_nope >= 1);
+
+    const int64_t q_lora_rank  = hparams.n_lora_q;
+    const int64_t kv_lora_rank = hparams.n_lora_kv;
+
+    const int64_t n_ff_exp        = hparams.n_ff_exp;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    // try to load output.weight, if not found, use token_embd (tied embeddings)
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    if (!output) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        if (q_lora_rank > 0) {
+            layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
+        }
+
+        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
+
+        if (q_lora_rank > 0) {
+            layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
+            layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
+        } else {
+            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
+        }
+
+        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
+
+        // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
+        if (is_mla) {
+            layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
+            layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
+        } else {
+            layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
+        }
+
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        if (i < (int) hparams.n_layer_dense_lead) {
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+        } else {
+            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+
+            if (n_expert == 0) {
+                throw std::runtime_error("n_expert must be > 0");
+            }
+            if (n_expert_used == 0) {
+                throw std::runtime_error("n_expert_used must be > 0");
+            }
+
+            // MoE branch
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+            create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
+
+            // Shared expert branch
+            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
+            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_deepseek2::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_deepseek2::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
     bool is_ocr = model.arch == LLM_ARCH_DEEPSEEK2OCR;
diff --git a/src/models/deepseek2ocr.cpp b/src/models/deepseek2ocr.cpp
new file mode 100644
index 000000000000..f9e4c98785c1
--- /dev/null
+++ b/src/models/deepseek2ocr.cpp
@@ -0,0 +1,82 @@
+#include "models.h"
+
+void llama_model_deepseek2ocr::load_arch_hparams(llama_model_loader & ml) {
+    // similar to deepseek2, but without MLA
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
+
+    if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+        hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
+    }
+
+    switch (hparams.n_layer) {
+        case 12: type = LLM_TYPE_3B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_deepseek2ocr::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+    // similar to deepseek2, but without MLA
+    const int64_t n_ff_exp        = hparams.n_ff_exp;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    // try to load output.weight, if not found, use token_embd (tied embeddings)
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    if (!output) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd}, 0);
+        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd}, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        // norm
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        if (i < (int) hparams.n_layer_dense_lead) {
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        } else {
+            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+
+            if (n_expert == 0) {
+                throw std::runtime_error("n_expert must be > 0");
+            }
+            if (n_expert_used == 0) {
+                throw std::runtime_error("n_expert_used must be > 0");
+            }
+
+            // MoE branch
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+            create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
+
+            // Shared expert branch
+            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
+            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_deepseek2ocr::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
diff --git a/src/models/dots1.cpp b/src/models/dots1.cpp
index 5d1750fedda9..93cbcf9d9314 100644
--- a/src/models/dots1.cpp
+++ b/src/models/dots1.cpp
@@ -1,6 +1,76 @@
 #include "models.h"
 
-llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_params & params) :
+void llama_model_dots1::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
+    switch (hparams.n_layer) {
+        case 62: type = LLM_TYPE_142B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_dots1::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+    const int64_t n_ff_exp        = hparams.n_ff_exp;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_head_k * n_head, n_embd_head_k * n_head, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        if (i < (int) hparams.n_layer_dense_lead) {
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+        } else {
+            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+
+            if (n_expert == 0) {
+                throw std::runtime_error("n_expert must be > 0");
+            }
+            if (n_expert_used == 0) {
+                throw std::runtime_error("n_expert_used must be > 0");
+            }
+
+            // MoE branch
+            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+
+            // Shared expert branch
+            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, 0);
+            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_dots1::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_dots1::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
diff --git a/src/models/dream.cpp b/src/models/dream.cpp
index 8e7d9ae64c74..60a3f0ec285e 100644
--- a/src/models/dream.cpp
+++ b/src/models/dream.cpp
@@ -1,6 +1,54 @@
 #include "models.h"
 
-llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_params & params) :
+void llama_model_dream::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    // Dream models are primarily 7B with 28 layers
+    switch (hparams.n_layer) {
+        case 28:
+            type = LLM_TYPE_7B;
+            break;
+        default:
+            type = LLM_TYPE_UNKNOWN;
+    }
+    // Set non-causal attention for diffusion models
+    hparams.causal_attn = false;
+}
+
+void llama_model_dream::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    output_b    = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_dream::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_dream::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     //copied from qwen2
     const int64_t n_embd_head = hparams.n_embd_head_v();
diff --git a/src/models/ernie4-5-moe.cpp b/src/models/ernie4-5-moe.cpp
index fc6a3e17a09d..2bd01a2c5129 100644
--- a/src/models/ernie4-5-moe.cpp
+++ b/src/models/ernie4-5-moe.cpp
@@ -1,6 +1,10 @@
 #include "models.h"
 
-llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) :
+std::unique_ptr<llm_graph_context> llama_model_ernie4_5_moe::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_ernie4_5_moe::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
diff --git a/src/models/ernie4-5.cpp b/src/models/ernie4-5.cpp
index 033ba409eab0..fa989fe92cd1 100644
--- a/src/models/ernie4-5.cpp
+++ b/src/models/ernie4-5.cpp
@@ -1,6 +1,79 @@
 #include "models.h"
 
-llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) :
+void llama_model_ernie4_5::load_arch_hparams(llama_model_loader & ml) {
+    // paddleocr need mrope_section
+    ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    if (arch == LLM_ARCH_ERNIE4_5_MOE) {
+        ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
+        ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+        ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,         hparams.n_moe_layer_step);
+        ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead, false);
+    }
+
+    switch (hparams.n_layer) {
+        case 18: type = LLM_TYPE_0_3B; break;
+        case 28: type = LLM_TYPE_21B_A3B; break;
+        case 54: type = LLM_TYPE_300B_A47B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_ernie4_5::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        // optional bias tensors
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
+            int n_ff_exp = hparams.n_ff_exp;
+
+            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff_exp, n_embd, n_expert}, 0);
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
+
+            // Shared expert (if present)
+            if (hparams.n_ff_shexp > 0) {
+                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, hparams.n_ff_shexp}, 0);
+                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd    }, 0);
+                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, hparams.n_ff_shexp}, 0);
+            }
+        } else { // Dense layers
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_ernie4_5::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_ernie4_5::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
diff --git a/src/models/eurobert.cpp b/src/models/eurobert.cpp
index 43fff4daf3ad..ddf13c3028f2 100644
--- a/src/models/eurobert.cpp
+++ b/src/models/eurobert.cpp
@@ -1,6 +1,41 @@
 #include "models.h"
 
-llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_eurobert::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    if (hparams.n_layer == 12) {
+        type = LLM_TYPE_SMALL;  // 0.2B
+    }
+}
+
+void llama_model_eurobert::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_eurobert::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_eurobert::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/exaone-moe.cpp b/src/models/exaone-moe.cpp
index 7b88a31d39d2..54bb3ca86b32 100644
--- a/src/models/exaone-moe.cpp
+++ b/src/models/exaone-moe.cpp
@@ -1,6 +1,117 @@
 #include "models.h"
 
-llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
+void llama_model_exaone_moe::load_arch_hparams(llama_model_loader & ml) {
+    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+    hparams.n_swa = 128;
+    uint32_t swa_period = 4;
+    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+    hparams.set_swa_pattern(swa_period);
+    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
+    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,                hparams.rope_freq_base_train_swa, false);
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,          hparams.n_swa);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared, false);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
+    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead, false);
+
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.nextn_predict_layers, false);
+    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_30B_A3B; break;
+        case 48:
+        case 49: type = LLM_TYPE_235B_A22B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const int64_t n_ff_exp       = hparams.n_ff_exp;
+    const int64_t n_ff_shexp     = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : n_ff_exp;
+    const int64_t head_dim       = hparams.n_embd_head_k();
+    const int64_t n_qo_dim       = n_head * head_dim;
+    const int64_t n_kv_dim       = n_head_kv * head_dim;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        int flags = 0;
+        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+            // skip all tensors in the NextN layers
+            flags |= TENSOR_SKIP;
+        }
+
+        auto & layer = layers[i];
+        create_tensor_qkv(layer, i, n_embd, n_qo_dim, n_kv_dim, n_kv_dim, flags);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, flags);
+
+        layer.rope_freqs   = create_tensor(tn(LLM_TENSOR_ROPE_FREQS,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0) | flags);
+
+        layer.attn_norm    = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, flags);
+        layer.attn_q_norm  = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
+        layer.attn_k_norm  = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
+
+        layer.ffn_norm     = create_tensor(tn(LLM_TENSOR_FFN_NORM,    "weight", i), {n_embd}, flags);
+
+        // dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end
+        if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers)) {
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, flags);
+        } else {
+            layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, flags);
+            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
+
+            if (n_expert == 0) {
+                throw std::runtime_error("n_expert must be > 0");
+            }
+            if (n_expert_used == 0) {
+                throw std::runtime_error("n_expert_used must be > 0");
+            }
+
+            layer.ffn_gate_exps  = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS,  "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
+            layer.ffn_down_exps  = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS,  "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
+            layer.ffn_up_exps    = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,    "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
+
+            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
+            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
+            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, flags);
+        }
+
+        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
+            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM,   "weight", i), {n_embd}, flags);
+            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM,   "weight", i), {n_embd}, flags);
+
+            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
+            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS,     "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
+            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_exaone_moe::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_k();
 
diff --git a/src/models/exaone.cpp b/src/models/exaone.cpp
index 4f845bf41060..75d5f60631c3 100644
--- a/src/models/exaone.cpp
+++ b/src/models/exaone.cpp
@@ -1,6 +1,49 @@
 #include "models.h"
 
-llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_params & params) :
+void llama_model_exaone::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_8B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_exaone::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM,   "weight", i), {n_embd}, 0);
+        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN,   "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,     "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_exaone::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_exaone::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
diff --git a/src/models/exaone4.cpp b/src/models/exaone4.cpp
index 34bee3b8fe9e..5506e76424d9 100644
--- a/src/models/exaone4.cpp
+++ b/src/models/exaone4.cpp
@@ -1,7 +1,71 @@
 #include "models.h"
 
+void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) {
+    if (hparams.n_layer == 64) {    // 32B
+        hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+        hparams.n_swa = 4096;
+        uint32_t swa_period = 4;
+        ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+        hparams.set_swa_pattern(swa_period);
+
+        hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+    }
+
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    switch (hparams.n_layer) {
+        case 30: type = LLM_TYPE_1_2B; break;
+        case 64: type = LLM_TYPE_32B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_exaone4::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_post_norm  = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_exaone4::build_arch_graph(const llm_graph_params & params) const {
+    if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+        return std::make_unique<graph<true>>(*this, params);
+    } else {
+        return std::make_unique<graph<false>>(*this, params);
+    }
+}
+
 template <bool iswa>
-llm_build_exaone4<iswa>::llm_build_exaone4(const llama_model & model, const llm_graph_params & params) :
+llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_k();
 
@@ -108,5 +172,5 @@ llm_build_exaone4<iswa>::llm_build_exaone4(const llama_model & model, const llm_
 }
 
 // Explicit template instantiations
-template struct llm_build_exaone4<false>;
-template struct llm_build_exaone4<true>;
+template struct llama_model_exaone4::graph<false>;
+template struct llama_model_exaone4::graph<true>;
diff --git a/src/models/falcon-h1.cpp b/src/models/falcon-h1.cpp
index 05accf90fadf..d353befdb8e4 100644
--- a/src/models/falcon-h1.cpp
+++ b/src/models/falcon-h1.cpp
@@ -1,6 +1,115 @@
 #include "models.h"
 
-llm_build_falcon_h1::llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) :
+void llama_model_falcon_h1::load_arch_hparams(llama_model_loader & ml) {
+    // Common parameters
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    // SSM parameters
+    ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
+    ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
+    ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
+    ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+    ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
+
+    std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
+
+    switch (hparams.n_layer) {
+        case 36:
+            type = LLM_TYPE_0_5B; break;
+        case 24:
+            type = LLM_TYPE_1_5B; break;
+        case 66:
+            type = LLM_TYPE_1B; break;
+        case 32:
+            type = LLM_TYPE_3B; break;
+        case 44:
+            type = LLM_TYPE_7B; break;
+        case 72:
+            type = LLM_TYPE_34B; break;
+        default:
+            type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_falcon_h1::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    // Common
+    const int64_t hidden_size = hparams.n_embd; // hidden_size
+
+    // mamba2 Mixer SSM params
+    const int64_t ssm_conv_kernel_size  = hparams.ssm_d_conv; // ssm_conv_kernel_size
+    const int64_t ssm_n_groups          = hparams.ssm_n_group; // ssm_n_groups
+    const int64_t ssm_state_size        = hparams.ssm_d_state; // ssm_state_size
+    const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
+    const int64_t ssm_num_heads         = hparams.ssm_dt_rank; // ssm_num_heads
+    const int64_t ssm_conv_dim          = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
+    const int64_t ssm_projection_size   = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
+
+    // attn params
+    const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head
+    const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
+
+    // ffn params
+    const int64_t ffn_intermediate_size = hparams.n_ff(0);
+
+    // embeddings
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
+
+    // output
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        /*SSM LAYERS*/
+        // ssm in
+        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
+        // ssm 1d conv
+        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
+        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
+        // ssm_dt
+        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
+        // no "weight" suffix for these
+        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
+        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
+        // ssm_norm
+        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
+        // out_proj
+        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
+
+        /*ATTENTION LAYERS*/
+        // attention layers (with optional bias)
+        create_tensor_qkv(layer, i, hidden_size, n_embd_head_k * attn_num_attention_head, attn_num_key_value_head * n_embd_head_k, attn_num_key_value_head * n_embd_head_v, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
+
+
+        // feed forward (w/ optional biases)
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
+        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size,   ffn_intermediate_size}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  ffn_intermediate_size, hidden_size}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {hidden_size,   ffn_intermediate_size}, 0);
+
+        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
+        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
+        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_falcon_h1::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_falcon_h1::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_build_mamba_base(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
diff --git a/src/models/falcon.cpp b/src/models/falcon.cpp
index 2f65fa56e1fd..75f2cfef5602 100644
--- a/src/models/falcon.cpp
+++ b/src/models/falcon.cpp
@@ -1,6 +1,53 @@
 #include "models.h"
 
-llm_build_falcon::llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_falcon::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_7B; break;
+        case 60: type = LLM_TYPE_40B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_falcon::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    {
+        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+
+        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+        if (!output) {
+            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
+        }
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
+
+        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_falcon::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_falcon::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/gemma-embedding.cpp b/src/models/gemma-embedding.cpp
index b6de9551c52c..4e07f5f2bdaf 100644
--- a/src/models/gemma-embedding.cpp
+++ b/src/models/gemma-embedding.cpp
@@ -1,6 +1,78 @@
 #include "models.h"
 
-llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
+void llama_model_gemma_embedding::load_arch_hparams(llama_model_loader & ml) {
+    hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
+    uint32_t swa_period = 6;
+    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+    hparams.set_swa_pattern(swa_period);
+
+    hparams.causal_attn = false; // embeddings do not use causal attention
+
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    //applied only if model converted with --sentence-transformers-dense-modules
+    ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
+    ml.get_key(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out, false);
+    ml.get_key(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in, false);
+    ml.get_key(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out, false);
+
+    GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd");
+    GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd");
+
+    switch (hparams.n_layer) {
+        case 24: type = LLM_TYPE_0_3B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+    hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
+
+}
+
+void llama_model_gemma_embedding::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,   "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    // Dense linear weights
+    dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.dense_2_feat_out}, TENSOR_NOT_REQUIRED);
+    dense_3_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_3_OUT, "weight"), {hparams.dense_3_feat_in, n_embd}, TENSOR_NOT_REQUIRED);
+
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head_k}, 0);
+        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head_k}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_gemma_embedding::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_gemma_embedding::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_k();
 
diff --git a/src/models/gemma.cpp b/src/models/gemma.cpp
index 09d2ff8bae7f..067316700072 100644
--- a/src/models/gemma.cpp
+++ b/src/models/gemma.cpp
@@ -1,6 +1,44 @@
 #include "models.h"
 
-llm_build_gemma::llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_gemma::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    switch (hparams.n_layer) {
+        case 18: type = LLM_TYPE_2B; break;
+        case 28: type = LLM_TYPE_7B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+   }
+}
+
+void llama_model_gemma::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_gemma::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_gemma::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     ggml_tensor * cur;
diff --git a/src/models/gemma2-iswa.cpp b/src/models/gemma2.cpp
similarity index 53%
rename from src/models/gemma2-iswa.cpp
rename to src/models/gemma2.cpp
index 0ef07df8d012..6255bf740fcf 100644
--- a/src/models/gemma2-iswa.cpp
+++ b/src/models/gemma2.cpp
@@ -1,6 +1,65 @@
 #include "models.h"
 
-llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_gemma2::load_arch_hparams(llama_model_loader & ml) {
+    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+    hparams.n_swa = 4096; // default value of gemma 2
+    uint32_t swa_period = 2;
+    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+    hparams.set_swa_pattern(swa_period);
+    hparams.attn_soft_cap = true;
+    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
+    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa, false);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING,      hparams.f_attn_logit_softcapping, false);
+    ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,     hparams.f_final_logit_softcapping, false);
+
+    switch (hparams.n_layer) {
+        case 26: type = LLM_TYPE_2B; break;
+        case 42: type = LLM_TYPE_9B; break;
+        case 46: type = LLM_TYPE_27B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+   }
+
+    // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
+    hparams.f_attention_scale = type == LLM_TYPE_27B
+        ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
+        : 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
+}
+
+void llama_model_gemma2::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_gemma2::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_gemma2::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_k();
 
     ggml_tensor * cur;
diff --git a/src/models/gemma3.cpp b/src/models/gemma3.cpp
index 0da4af21c173..ee510fe38b0e 100644
--- a/src/models/gemma3.cpp
+++ b/src/models/gemma3.cpp
@@ -1,7 +1,87 @@
 #include "models.h"
 
+void llama_model_gemma3::load_arch_hparams(llama_model_loader & ml) {
+    const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+    if (found_swa && hparams.n_swa > 0) {
+        hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+        uint32_t swa_period = 6;
+        ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+        hparams.set_swa_pattern(swa_period);
+
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+    } else {
+        hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+    }
+
+    hparams.f_final_logit_softcapping = 0.0f;
+    ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    switch (hparams.n_layer) {
+        case 18: type = LLM_TYPE_270M; break;
+        case 26: type = LLM_TYPE_1B; break;
+        case 32: type = LLM_TYPE_8B; break; // Rnj-1
+        case 34: type = LLM_TYPE_4B; break;
+        case 48: type = LLM_TYPE_12B; break;
+        case 62: type = LLM_TYPE_27B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+
+    // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
+    hparams.f_attention_scale = type == LLM_TYPE_27B
+        ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
+        : 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
+}
+
+void llama_model_gemma3::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,   "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    // Dense linear weights
+    dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.dense_2_feat_out}, TENSOR_NOT_REQUIRED);
+    dense_3_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_3_OUT, "weight"), {hparams.dense_3_feat_in, n_embd}, TENSOR_NOT_REQUIRED);
+
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head_k}, 0);
+        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head_k}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_gemma3::build_arch_graph(const llm_graph_params & params) const {
+    if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+        return std::make_unique<graph<true>>(*this, params);
+    } else {
+        return std::make_unique<graph<false>>(*this, params);
+    }
+}
+
 template <bool iswa>
-llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+llama_model_gemma3::graph<iswa>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_k();
 
     ggml_tensor * cur;
@@ -141,5 +221,5 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
     ggml_build_forward_expand(gf, cur);
 }
 
-template struct llm_build_gemma3<false>;
-template struct llm_build_gemma3<true>;
+template struct llama_model_gemma3::graph<false>;
+template struct llama_model_gemma3::graph<true>;
diff --git a/src/models/gemma3n-iswa.cpp b/src/models/gemma3n.cpp
similarity index 76%
rename from src/models/gemma3n-iswa.cpp
rename to src/models/gemma3n.cpp
index f8095417e06f..881499b0ca79 100644
--- a/src/models/gemma3n-iswa.cpp
+++ b/src/models/gemma3n.cpp
@@ -1,5 +1,86 @@
 #include "models.h"
 
+void llama_model_gemma3n::load_arch_hparams(llama_model_loader & ml) {
+    uint32_t swa_period = 5;
+    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+    hparams.set_swa_pattern(swa_period);
+
+    hparams.n_layer_kv_from_start     = 20;
+    hparams.f_attention_scale         = 1.0f;
+
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    switch (hparams.n_layer) {
+        case 30: type = LLM_TYPE_E2B; break;
+        case 35: type = LLM_TYPE_E4B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_gemma3n::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const int64_t n_altup      = hparams.n_altup;
+    const int64_t laurel_rank  = hparams.laurel_rank;
+    const int64_t n_embd_altup = hparams.n_embd_altup;
+
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    altup_proj        = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ,        "weight"), {n_embd, n_embd, n_altup - 1}, 0);
+    altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
+
+    per_layer_tok_embd   = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
+    per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight", 0), {n_embd, n_embd_altup * n_layer}, 0);
+    per_layer_proj_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM,  "weight", 0), {n_embd_altup}, 0);
+
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head_k}, 0);
+        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head_k}, 0);
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+
+        // altup & laurel
+        layer.per_layer_inp_gate   = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE,  "weight", i), {n_embd, n_embd_altup}, 0);
+        layer.per_layer_proj       = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ,      "weight", i), {n_embd_altup, n_embd}, 0);
+        layer.per_layer_post_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
+        layer.altup_correct_coef   = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF,  "weight", i), {n_altup, n_altup}, 0);
+        layer.altup_correct_scale  = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
+        layer.altup_predict_coef   = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF,  "weight", i), {n_altup, n_altup * n_altup}, 0);
+        layer.altup_router         = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER,        "weight", i), {n_embd, n_altup}, 0);
+        layer.altup_router_norm    = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM,   "weight", i), {n_embd}, 0);
+        layer.laurel_l             = create_tensor(tn(LLM_TENSOR_LAUREL_L,            "weight", i), {n_embd, laurel_rank}, 0);
+        layer.laurel_r             = create_tensor(tn(LLM_TENSOR_LAUREL_R,            "weight", i), {laurel_rank, n_embd}, 0);
+        layer.laurel_post_norm     = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM,    "weight", i), {n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_gemma3n::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
 // get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim
 static ggml_tensor * ggml_view_2d_slice(ggml_context * ctx0, ggml_tensor * x, int idx) {
     GGML_ASSERT(idx < (int) x->ne[2]);
@@ -7,7 +88,7 @@ static ggml_tensor * ggml_view_2d_slice(ggml_context * ctx0, ggml_tensor * x, in
                         idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
 }
 
-llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
+llama_model_gemma3n::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params),
     model(model),
     n_embd_head(model.hparams.n_embd_head_k()),
@@ -229,13 +310,13 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
     ggml_build_forward_expand(gf, cur);
 }
 
-ggml_tensor * llm_build_gemma3n_iswa::calc_magnitude(ggml_tensor * x) {
+ggml_tensor * llama_model_gemma3n::graph::calc_magnitude(ggml_tensor * x) {
     return ggml_sqrt(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, x)));
 }
 
 // equivalent to get_per_layer_inputs() in python code
 // output shape: [n_embd_altup, n_layer, n_tokens]
-ggml_tensor * llm_build_gemma3n_iswa::build_inp_per_layer() {
+ggml_tensor * llama_model_gemma3n::graph::build_inp_per_layer() {
     auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
     ggml_tensor * inp_per_layer;
     float tok_embd_scale = sqrtf((float) n_embd_altup);
@@ -268,7 +349,7 @@ ggml_tensor * llm_build_gemma3n_iswa::build_inp_per_layer() {
 // equivalent to project_per_layer_inputs() in python code
 // this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim
 // output shape: [n_embd_altup, n_tokens, n_layer]
-ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp_batch, ggml_tensor * inp_per_layer) {
+ggml_tensor * llama_model_gemma3n::graph::project_per_layer_inputs(ggml_tensor * inp_batch, ggml_tensor * inp_per_layer) {
     const float per_layer_projection_scale = 1.0f / sqrtf((float) n_embd);
     const float per_layer_input_scale      = 1.0f / sqrtf(2.0f);
 
@@ -291,7 +372,7 @@ ggml_tensor * llm_build_gemma3n_iswa::project_per_layer_inputs(ggml_tensor * inp
 
 // input cur shape: [n_altup, n_tokens]
 // output    shape: [n_altup, n_tokens]
-ggml_tensor * llm_build_gemma3n_iswa::laurel(ggml_tensor * cur, int il) {
+ggml_tensor * llama_model_gemma3n::graph::laurel(ggml_tensor * cur, int il) {
     ggml_tensor * tmp = cur;
     tmp               = build_lora_mm(model.layers[il].laurel_l, tmp);
     tmp               = build_lora_mm(model.layers[il].laurel_r, tmp);
@@ -303,7 +384,7 @@ ggml_tensor * llm_build_gemma3n_iswa::laurel(ggml_tensor * cur, int il) {
 
 // input x shape: [n_embd, n_tokens]
 // output  shape: [n_embd, n_tokens]
-ggml_tensor * llm_build_gemma3n_iswa::gaussian_topk(ggml_tensor * x) {
+ggml_tensor * llama_model_gemma3n::graph::gaussian_topk(ggml_tensor * x) {
     ggml_tensor * mean = ggml_mean(ctx0, x);
     ggml_tensor * std  = ggml_sqrt(ctx0, ggml_scale(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x, mean))),
                                                     1.0f / (float) (x->ne[0] - 1)));
@@ -318,7 +399,7 @@ ggml_tensor * llm_build_gemma3n_iswa::gaussian_topk(ggml_tensor * x) {
 // equivalent to compute_router_modalities() in python code
 // input x shape: [n_embd,  n_tokens]
 // output  shape: [n_altup, n_tokens]
-ggml_tensor * llm_build_gemma3n_iswa::altup_compute_router_modalities(ggml_tensor * x, int il) {
+ggml_tensor * llama_model_gemma3n::graph::altup_compute_router_modalities(ggml_tensor * x, int il) {
     ggml_tensor * router_inputs = build_norm(x, model.layers[il].altup_router_norm, NULL, LLM_NORM_RMS, il);
 
     // router_input_scale
@@ -330,7 +411,7 @@ ggml_tensor * llm_build_gemma3n_iswa::altup_compute_router_modalities(ggml_tenso
 
 // input cur shape: [n_embd, n_tokens, n_altup]
 // output    shape: [n_embd, n_tokens, n_altup]
-ggml_tensor * llm_build_gemma3n_iswa::altup_predict(ggml_tensor * cur, int il) {
+ggml_tensor * llama_model_gemma3n::graph::altup_predict(ggml_tensor * cur, int il) {
     ggml_tensor * activated  = ggml_view_2d_slice(ctx0, cur, i_altup_act);      // [n_embd, n_tokens]
     ggml_tensor * modalities = altup_compute_router_modalities(activated, il);  // [n_altup, n_tokens]
     cb(modalities, "modalities", il);
@@ -355,7 +436,7 @@ ggml_tensor * llm_build_gemma3n_iswa::altup_predict(ggml_tensor * cur, int il) {
 // input predictions       shape: [n_embd, n_tokens, n_altup]
 // input activated         shape: [n_embd, n_tokens]
 // output                  shape: [n_embd, n_tokens, n_altup]
-ggml_tensor * llm_build_gemma3n_iswa::altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il) {
+ggml_tensor * llama_model_gemma3n::graph::altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il) {
     ggml_tensor * modalities = altup_compute_router_modalities(activated, il);  // [n_altup, n_tokens]
     cb(modalities, "modalities", il);
 
diff --git a/src/models/gemma4-iswa.cpp b/src/models/gemma4.cpp
similarity index 64%
rename from src/models/gemma4-iswa.cpp
rename to src/models/gemma4.cpp
index c7fb7747414d..5026b0ac2ad4 100644
--- a/src/models/gemma4-iswa.cpp
+++ b/src/models/gemma4.cpp
@@ -1,5 +1,134 @@
 #include "models.h"
 
+void llama_model_gemma4::load_arch_hparams(llama_model_loader & ml) {
+    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
+
+    uint32_t n_kv_shared_layers = 0;
+    ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false);
+
+    hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t)n_kv_shared_layers;
+    hparams.f_attention_scale     = 1.0f; // Gemma4 uses self.scaling = 1.0 (no pre-attn scaling)
+
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp, false);
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_EMBEDDING_LENGTH_PER_LAYER,  hparams.n_embd_per_layer);
+    ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA,    hparams.n_embd_head_k_swa);
+    ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA,  hparams.n_embd_head_v_swa);
+    ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,     hparams.f_final_logit_softcapping, false);
+
+    switch (hparams.n_layer) {
+        case 30: type = LLM_TYPE_26B_A4B; break;
+        case 35: type = LLM_TYPE_E2B; break;
+        case 42: type = LLM_TYPE_E4B; break;
+        case 60: type = LLM_TYPE_31B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_gemma4::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const uint32_t n_embd_per_layer = hparams.n_embd_per_layer;
+    const int64_t  n_ff_exp         = hparams.n_ff_exp;
+
+    if (n_embd_head_k != n_embd_head_v) {
+        throw std::runtime_error("Gemma 4 requires n_embd_head_k == n_embd_head_v");
+    }
+    if (hparams.n_embd_head_k_swa != hparams.n_embd_head_v_swa) {
+        throw std::runtime_error("Gemma 4 requires n_embd_head_k_swa == n_embd_head_v_swa");
+    }
+
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    if (n_embd_per_layer > 0) {
+        per_layer_tok_embd   = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"),    {n_embd_per_layer * n_layer, n_vocab}, 0);
+        per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight", 0), {n_embd, n_embd_per_layer * n_layer}, 0);
+        per_layer_proj_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM,  "weight", 0), {n_embd_per_layer}, 0);
+    }
+
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+    int rope_freqs_flag = 0;
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+        const int64_t n_head      = hparams.n_head(i);
+        const int64_t n_embd_head = hparams.n_embd_head_k(i);
+        const int64_t n_embd_k    = hparams.n_embd_k_gqa(i);
+        const int64_t n_embd_v    = hparams.n_embd_v_gqa(i);
+        const int     kv_flags    = hparams.has_kv(i) ? 0 : TENSOR_NOT_REQUIRED;
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        // note: use_alternative_attention (v_proj is optional, if it's not present, use k_proj)
+        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head * n_head}, 0);
+        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k}, kv_flags);
+        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v}, TENSOR_NOT_REQUIRED);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head * n_head, n_embd}, 0);
+
+        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head}, 0);
+        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head}, kv_flags);
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+        layer.out_scale = create_tensor(tn(LLM_TENSOR_LAYER_OUT_SCALE, "weight", i), {1u}, TENSOR_NOT_REQUIRED);
+
+        if (!hparams.is_swa(i)) {
+            // full_attention layers use rope_freqs for proportional rope
+            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_embd_head/2}, rope_freqs_flag);
+            rope_freqs_flag = TENSOR_DUPLICATED;
+        }
+
+        // handle use_double_wide_mlp
+        int64_t n_ff_cur = hparams.n_ff(i);
+
+        // for expert layers, we use normal FFN as shared expert (same as python code)
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff_cur}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff_cur}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
+        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+
+        // MoE router
+        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+        bool has_expert = layer.ffn_gate_inp != nullptr;
+
+        // norm
+        if (has_expert) {
+            layer.ffn_gate_inp_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "scale", i), {n_embd}, 0);
+
+            layer.ffn_pre_norm_2  = create_tensor(tn(LLM_TENSOR_FFN_PRE_NORM_2,  "weight", i), {n_embd}, 0);
+            layer.ffn_post_norm_1 = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM_1, "weight", i), {n_embd}, 0);
+            layer.ffn_post_norm_2 = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM_2, "weight", i), {n_embd}, 0);
+
+            // MoE FFN
+            layer.ffn_gate_up_exps  = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS,  "weight", i), {n_embd, n_ff_exp * 2, n_expert}, 0);
+            layer.ffn_down_exps     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS,     "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+
+            // per-expert scale will be loaded as down_exps_s at the end of the current switch case
+        }
+
+        // per-layer embeddings
+        if (n_embd_per_layer > 0) {
+            layer.per_layer_inp_gate   = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE,  "weight", i), {n_embd, n_embd_per_layer}, 0);
+            layer.per_layer_proj       = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ,      "weight", i), {n_embd_per_layer, n_embd}, 0);
+            layer.per_layer_post_norm  = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_gemma4::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
 // get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim
 static ggml_tensor * ggml_view_2d_slice(ggml_context * ctx0, ggml_tensor * x, int idx) {
     GGML_ASSERT(idx < (int) x->ne[2]);
@@ -7,7 +136,7 @@ static ggml_tensor * ggml_view_2d_slice(ggml_context * ctx0, ggml_tensor * x, in
                         idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
 }
 
-llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const llm_graph_params & params) :
+llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_params & params) :
         llm_graph_context(params),
         model(model),
         n_embd_per_layer(model.hparams.n_embd_per_layer) {
@@ -261,7 +390,7 @@ llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const ll
 
 // equivalent to get_per_layer_inputs() in python code
 // output shape: [n_embd_per_layer, n_layer, n_tokens]
-ggml_tensor * llm_build_gemma4_iswa::build_inp_per_layer() {
+ggml_tensor * llama_model_gemma4::graph::build_inp_per_layer() {
     auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
 
     ggml_tensor * inp_per_layer;
@@ -299,7 +428,7 @@ ggml_tensor * llm_build_gemma4_iswa::build_inp_per_layer() {
 // inp_batch     shape: [n_embd, n_tokens]
 // inp_per_layer shape: [n_embd_per_layer, n_layer, n_tokens] (from build_inp_per_layer)
 // output shape: [n_embd_per_layer, n_tokens, n_layer]
-ggml_tensor * llm_build_gemma4_iswa::project_per_layer_inputs(ggml_tensor * inp_batch, ggml_tensor * inp_per_layer) {
+ggml_tensor * llama_model_gemma4::graph::project_per_layer_inputs(ggml_tensor * inp_batch, ggml_tensor * inp_per_layer) {
     const float per_layer_projection_scale = 1.0f / sqrtf((float) n_embd);
     const float per_layer_input_scale      = 1.0f / sqrtf(2.0f);
 
diff --git a/src/models/glm-dsa.cpp b/src/models/glm-dsa.cpp
new file mode 100644
index 000000000000..af2b55ef563d
--- /dev/null
+++ b/src/models/glm-dsa.cpp
@@ -0,0 +1,155 @@
+#include "models.h"
+
+void llama_model_glm_dsa::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,     hparams.n_ff_exp);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,    hparams.f_norm_rms_eps);
+    ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
+    // MoE parameters
+    ml.get_key(LLM_KV_EXPERT_COUNT,                hparams.n_expert);
+    ml.get_key(LLM_KV_EXPERT_USED_COUNT,           hparams.n_expert_used);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
+    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
+
+    // deepseek MLA parameters
+    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,      hparams.n_lora_q);
+    ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,     hparams.n_lora_kv);
+    ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,   hparams.n_embd_head_k_mla_impl, false);
+    ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,        hparams.n_expert_shared);
+
+    // DSA parameters
+    ml.get_key(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head);
+    ml.get_key(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size);
+    ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K,      hparams.indexer_top_k);
+
+    // Expert gating function (GLM-4.5 uses sigmoid)
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
+    if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+        hparams.expert_gating_func =  LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+    }
+
+    // NextN/MTP parameters
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
+    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+
+    // TODO: when MTP is implemented, this should probably be updated if needed
+    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
+    switch (hparams.n_layer) {
+        case 79: type = LLM_TYPE_744B_A40B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+    const bool is_mla = hparams.is_mla();
+    if (!is_mla) {
+        throw std::runtime_error("GLM_DSA architecture requires MLA");
+    }
+
+    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
+    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
+
+    const int64_t n_embd_head_qk_rope = hparams.n_rot();
+    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
+
+    const int64_t q_lora_rank  = hparams.n_lora_q;
+    const int64_t kv_lora_rank = hparams.n_lora_kv;
+
+    const int64_t n_ff_exp        = hparams.n_ff_exp;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    // try to load output.weight, if not found, use token_embd (tied embeddings)
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    if (!output) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        int flags = 0;
+        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+            // skip all tensors in the NextN layers
+            // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later
+            flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED;
+        }
+
+        auto & layer = layers[i];
+
+        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
+        layer.attn_q_a_norm  = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, flags);
+        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, flags);
+
+        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, flags);
+        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, flags);
+
+        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, flags);
+
+        // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
+        layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, flags);
+        layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, flags);
+
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, flags);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+
+        // DSA indexer
+        layer.indexer_k_norm   = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "weight", i), {hparams.indexer_head_size}, flags);
+        layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "bias",   i), {hparams.indexer_head_size}, flags);
+        layer.indexer_proj     = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ,     "weight", i), {n_embd, hparams.indexer_n_head}, flags);
+        layer.indexer_attn_k   = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K,   "weight", i), {n_embd, hparams.indexer_head_size}, flags);
+        layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags);
+        if (i < (int) hparams.n_layer_dense_lead) {
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, flags);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, flags);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, flags);
+        } else {
+            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
+            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+
+            if (n_expert == 0) {
+                throw std::runtime_error("n_expert must be > 0");
+            }
+            if (n_expert_used == 0) {
+                throw std::runtime_error("n_expert_used must be > 0");
+            }
+
+            // MoE branch
+            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, flags);
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
+
+            // Shared expert branch
+            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
+            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, flags);
+            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
+        }
+
+        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+
+            // Optional tensors
+            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_glm_dsa::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
diff --git a/src/models/glm4-moe.cpp b/src/models/glm4-moe.cpp
index 8d4f4a015533..45886b51ac16 100644
--- a/src/models/glm4-moe.cpp
+++ b/src/models/glm4-moe.cpp
@@ -1,6 +1,139 @@
 #include "models.h"
 
-llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_glm4_moe::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,     hparams.n_ff_exp);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,    hparams.f_norm_rms_eps);
+    ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
+    // MoE parameters
+    ml.get_key(LLM_KV_EXPERT_COUNT,                hparams.n_expert);
+    ml.get_key(LLM_KV_EXPERT_USED_COUNT,           hparams.n_expert_used);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
+    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
+
+    // Expert gating function (GLM-4.5 uses sigmoid)
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
+    if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+        hparams.expert_gating_func =  LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+    }
+
+    // NextN/MTP parameters
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
+    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+
+    // TODO: when MTP is implemented, this should probably be updated if needed
+    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
+    switch (hparams.n_layer) {
+        case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
+        case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
+        case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_glm4_moe::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+
+    GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
+    GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+    }
+
+    // Load ALL tensors including NextN layer to satisfy total tensor count
+    // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
+    for (int i = 0; i < n_layer; ++i) {
+        int flags = 0;
+        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+            // skip all tensors in the NextN layers
+            flags |= TENSOR_SKIP;
+        }
+
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
+
+        // GLM-style attention with bias terms
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, flags);
+
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
+
+        // K/Q norm tensors (optional for GLM-4.5 355B variant)
+        layer.attn_q_norm = create_tensor(
+            tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
+        layer.attn_k_norm = create_tensor(
+            tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
+
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
+
+        // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
+        // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
+        const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
+
+        if (use_moe) {
+            // MoE layers
+            layer.ffn_gate_inp =
+                create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
+            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
+
+            // MoE branch
+            const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+            layer.ffn_gate_exps = create_tensor(
+                tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
+            layer.ffn_down_exps = create_tensor(
+                tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
+            layer.ffn_up_exps = create_tensor(
+                tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
+
+            // Shared expert
+            if (n_expert_shared > 0) {
+                const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
+                layer.ffn_gate_shexp = create_tensor(
+                    tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
+                layer.ffn_down_shexp = create_tensor(
+                    tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
+                layer.ffn_up_shexp = create_tensor(
+                    tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
+            }
+        } else {
+            // Dense layers (first k layers) - GLM uses separate gate/up projections
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), { n_embd, n_ff }, flags);
+        }
+
+        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+
+            // Optional tensors
+            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_glm4_moe::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp
index f0bfda393fa1..d6ef76e26d6e 100644
--- a/src/models/glm4.cpp
+++ b/src/models/glm4.cpp
@@ -1,6 +1,78 @@
 #include "models.h"
 
-llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_glm4::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,    hparams.f_norm_rms_eps);
+    ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
+    // NextN/MTP parameters (GLM-OCR)
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+
+    // TODO: when MTP is implemented, this should probably be updated if needed
+    hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
+    switch (hparams.n_layer) {
+        case 17: type = LLM_TYPE_1B; break; // GLM-OCR
+        case 40: type = LLM_TYPE_9B; break;
+        case 61: type = LLM_TYPE_32B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_glm4::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        int flags = 0;
+        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+            // skip all tensors in the NextN layers
+            flags |= TENSOR_SKIP;
+        }
+
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, flags);
+
+        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
+
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, flags);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, flags);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, flags);
+
+        layer.ffn_post_norm  = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags);
+
+        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+
+            // Optional tensors
+            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_glm4::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/gpt2.cpp b/src/models/gpt2.cpp
index f8dc53eb723f..ba49c31b56b8 100644
--- a/src/models/gpt2.cpp
+++ b/src/models/gpt2.cpp
@@ -1,6 +1,60 @@
 #include "models.h"
 
-llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_gpt2::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+    switch (hparams.n_layer) {
+        case 12: type = LLM_TYPE_SMALL; break;
+        case 24: type = LLM_TYPE_MEDIUM; break;
+        case 36: type = LLM_TYPE_LARGE; break;
+        case 48: type = LLM_TYPE_XL; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_gpt2::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, 0);
+
+    // output
+    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, 0);
+        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, 0);
+
+        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+        layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+
+        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
+
+        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
+
+        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
+
+        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_gpt2::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_gpt2::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/gptneox.cpp b/src/models/gptneox.cpp
index 0016ddede43d..33ebe2d8800e 100644
--- a/src/models/gptneox.cpp
+++ b/src/models/gptneox.cpp
@@ -1,6 +1,89 @@
 #include "models.h"
 
-llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_gptneox::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+    ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL,   hparams.use_par_res);
+    switch (hparams.n_layer) {
+        case 6:
+            switch (hparams.n_ff()) {
+                case 512:  type = LLM_TYPE_14M; break;
+                case 2048: type = LLM_TYPE_70M; break;
+                default:   type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 12:
+            switch (hparams.n_ff()) {
+                case 3072: type = LLM_TYPE_160M; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 16:
+            switch (hparams.n_ff()) {
+                case 8192: type = LLM_TYPE_1B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 24:
+            switch (hparams.n_ff()) {
+                case 4096: type = LLM_TYPE_410M; break;
+                case 8192: type = LLM_TYPE_1_4B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 32:
+            switch (hparams.n_ff()) {
+                case 10240: type = LLM_TYPE_2_8B; break;
+                case 16384: type = LLM_TYPE_6_9B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 36:
+            switch (hparams.n_ff()) {
+                case 20480: type = LLM_TYPE_12B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 44:
+            switch (hparams.n_ff()) {
+                case 24576: type = LLM_TYPE_20B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_gptneox::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
+
+        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+        layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+
+        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
+
+        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
+
+        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
+
+        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_gptneox::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_gptneox::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/granite-hybrid.cpp b/src/models/granite-hybrid.cpp
index e983742bef58..12e4790ae249 100644
--- a/src/models/granite-hybrid.cpp
+++ b/src/models/granite-hybrid.cpp
@@ -1,6 +1,137 @@
 #include "models.h"
 
-llm_build_granite_hybrid::llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params) :
+void llama_model_granite_hybrid::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale, /* required */ false);
+    ml.get_key(LLM_KV_RESIDUAL_SCALE,              hparams.f_residual_scale, /* required */ false);
+    ml.get_key(LLM_KV_EMBEDDING_SCALE,             hparams.f_embedding_scale, /* required */ false);
+    ml.get_key(LLM_KV_ATTENTION_SCALE,             hparams.f_attention_scale, /* required */ false);
+
+    ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
+    ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
+    ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
+    ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+    ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
+
+    // Granite uses rope_finetuned as a switch for rope, so default to true
+    bool rope_finetuned = true;
+    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
+    hparams.rope_finetuned = rope_finetuned;
+
+    // A layer is recurrent IFF the n_head_kv value is set to 0
+    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+        hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
+    }
+
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    switch (hparams.n_embd) {
+        case 768: type = LLM_TYPE_350M; break;
+        case 1536: type = (hparams.n_ff() == 512 ? LLM_TYPE_7B_A1B : LLM_TYPE_1B); break;
+        case 2048: case 2560: type = LLM_TYPE_3B; break;
+        case 4096: type = LLM_TYPE_32B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+
+    // For Granite MoE Shared
+    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
+}
+
+void llama_model_granite_hybrid::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    // mamba2 Mixer SSM params
+    // NOTE: int64_t for tensor dimensions
+    const int64_t d_conv     = hparams.ssm_d_conv;
+    const int64_t d_inner    = hparams.ssm_d_inner;
+    const int64_t d_state    = hparams.ssm_d_state;
+    const int64_t n_ssm_head = hparams.ssm_dt_rank;
+    const int64_t n_group    = hparams.ssm_n_group;
+    const int64_t d_in_proj  = 2*d_inner + 2*n_group*d_state + n_ssm_head;
+
+    // only an expansion factor of 2 is supported for now
+    GGML_ASSERT(2 * n_embd == d_inner);
+
+    // embeddings
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    {
+        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+        // if output is NULL, init from the input tok embed, duplicated to allow offloading
+        if (output == NULL) {
+            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+        }
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        // norm
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        if (hparams.is_recurrent(i)) {
+            // ssm layers
+            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
+
+            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
+            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
+
+            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
+
+            // no "weight" suffix for these
+            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
+            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
+
+            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
+
+            // out_proj
+            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+        } else {
+            // attention layers (with optional bias)
+            const int64_t n_head_i = hparams.n_head(i);
+            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
+            const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
+            create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_i, n_embd_k_gqa_i, n_embd_v_gqa_i, 0);
+            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
+            layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+        }
+
+        // feed forward (w/ optional biases)
+        if (n_expert > 0) {
+            // MoE FFN
+            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
+
+            // For Granite MoE Shared
+            if (hparams.n_ff_shexp > 0) {
+                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+            }
+        } else {
+            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_granite_hybrid::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_build_mamba_base(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
@@ -67,7 +198,7 @@ llm_build_granite_hybrid::llm_build_granite_hybrid(const llama_model & model, co
     ggml_build_forward_expand(gf, cur);
 }
 
-ggml_tensor * llm_build_granite_hybrid::build_attention_layer(ggml_tensor *             cur,
+ggml_tensor * llama_model_granite_hybrid::graph::build_attention_layer(ggml_tensor *             cur,
                                                               ggml_tensor *             inp_pos,
                                                               llm_graph_input_attn_kv * inp_attn,
                                                               const llama_model &       model,
@@ -98,7 +229,7 @@ ggml_tensor * llm_build_granite_hybrid::build_attention_layer(ggml_tensor *
     return cur;
 }
 
-ggml_tensor * llm_build_granite_hybrid::build_layer_ffn(ggml_tensor *       cur,
+ggml_tensor * llama_model_granite_hybrid::graph::build_layer_ffn(ggml_tensor *       cur,
                                                         ggml_tensor *       inpSA,
                                                         const llama_model & model,
                                                         const int           il) {
diff --git a/src/models/granite-moe.cpp b/src/models/granite-moe.cpp
new file mode 100644
index 000000000000..0d89bc1f3404
--- /dev/null
+++ b/src/models/granite-moe.cpp
@@ -0,0 +1,89 @@
+#include "models.h"
+
+void llama_model_granite_moe::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale);
+    ml.get_key(LLM_KV_RESIDUAL_SCALE,              hparams.f_residual_scale, false);
+    ml.get_key(LLM_KV_EMBEDDING_SCALE,             hparams.f_embedding_scale, false);
+    ml.get_key(LLM_KV_ATTENTION_SCALE,             hparams.f_attention_scale, false);
+
+    // Granite uses rope_finetuned as a switch for rope, so default to true
+    bool rope_finetuned = true;
+    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
+    hparams.rope_finetuned = rope_finetuned;
+
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_3B; break;
+        case 40: type = LLM_TYPE_3B; break;
+        // Add additional layer/vocab/etc checks here for other model sizes
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+
+    // For Granite MoE Shared
+    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
+}
+
+void llama_model_granite_moe::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        // optional bias tensors
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        }
+        else {
+            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        }
+
+        if (n_expert == 0) {
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+            // optional MLP bias
+            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+        } else {
+            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
+
+            // For Granite MoE Shared
+            if (hparams.n_ff_shexp > 0) {
+                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+            }
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_granite_moe::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
diff --git a/src/models/granite.cpp b/src/models/granite.cpp
index 6ea902852251..5e7c7b68181f 100644
--- a/src/models/granite.cpp
+++ b/src/models/granite.cpp
@@ -1,6 +1,93 @@
 #include "models.h"
 
-llm_build_granite::llm_build_granite(
+void llama_model_granite::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_LOGIT_SCALE,                 hparams.f_logit_scale);
+    ml.get_key(LLM_KV_RESIDUAL_SCALE,              hparams.f_residual_scale, false);
+    ml.get_key(LLM_KV_EMBEDDING_SCALE,             hparams.f_embedding_scale, false);
+    ml.get_key(LLM_KV_ATTENTION_SCALE,             hparams.f_attention_scale, false);
+
+    // Granite uses rope_finetuned as a switch for rope, so default to true
+    bool rope_finetuned = true;
+    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
+    hparams.rope_finetuned = rope_finetuned;
+
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_3B; break;
+        case 40: type = LLM_TYPE_3B; break;
+        // Add additional layer/vocab/etc checks here for other model sizes
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+
+    // For Granite MoE Shared
+    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
+}
+
+void llama_model_granite::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        // optional bias tensors
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        }
+        else {
+            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        }
+
+        if (n_expert == 0) {
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+            // optional MLP bias
+            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+        } else {
+            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
+
+            // For Granite MoE Shared
+            if (hparams.n_ff_shexp > 0) {
+                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+            }
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_granite::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_granite::graph::graph(
     const llama_model & model,
     const llm_graph_params & params)
     : llm_graph_context(params) {
@@ -68,7 +155,7 @@ llm_build_granite::llm_build_granite(
     ggml_build_forward_expand(gf, cur);
 }
 
-ggml_tensor * llm_build_granite::build_attention_layer(
+ggml_tensor * llama_model_granite::graph::build_attention_layer(
           ggml_tensor             * cur,
           ggml_tensor             * inp_pos,
           llm_graph_input_attn_kv * inp_attn,
@@ -107,7 +194,7 @@ ggml_tensor * llm_build_granite::build_attention_layer(
     return cur;
 }
 
-ggml_tensor * llm_build_granite::build_layer_ffn(
+ggml_tensor * llama_model_granite::graph::build_layer_ffn(
           ggml_tensor       * cur,
           ggml_tensor       * inpSA,
     const llama_model       & model,
diff --git a/src/models/grok.cpp b/src/models/grok.cpp
index b8f35afdc032..0bc49d00206c 100644
--- a/src/models/grok.cpp
+++ b/src/models/grok.cpp
@@ -1,6 +1,89 @@
 #include "models.h"
 
-llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_grok::load_arch_hparams(llama_model_loader & ml) {
+    // defaults for old GGUFs
+    hparams.yarn_beta_fast = 8.0f;
+    hparams.f_logit_scale = 0.5773502691896257f;
+    hparams.f_embedding_scale = 78.38367176906169f;
+    hparams.f_attn_out_scale = 0.08838834764831845f;
+    hparams.f_attn_logit_softcapping = 30.0f;
+    hparams.f_router_logit_softcapping = 30.0f;
+    // no final_logit_softcapping in grok-1
+    hparams.f_final_logit_softcapping = 0.0f;
+
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,  hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,   hparams.n_ff_exp, false);
+    ml.get_key(LLM_KV_LOGIT_SCALE,                  hparams.f_logit_scale, false);
+    ml.get_key(LLM_KV_EMBEDDING_SCALE,              hparams.f_embedding_scale, false);
+    ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE,       hparams.f_attn_out_scale, false);
+    ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING,       hparams.f_attn_logit_softcapping, false);
+    ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING,     hparams.f_router_logit_softcapping, false);
+    ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,      hparams.f_final_logit_softcapping, false);
+
+    ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH,  hparams.attn_temp_length, false);
+    ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,  hparams.yarn_ext_factor, false);
+    ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
+    ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST,   hparams.yarn_beta_fast, false);
+    ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,   hparams.yarn_beta_slow, false);
+
+    switch (hparams.n_layer) {
+        case 64: type = LLM_TYPE_314B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_grok::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    if (n_expert == 0) {
+        throw std::runtime_error(arch_name() + " model cannot have zero experts");
+    }
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff/* / n_expert_used*/; // grok-1 n_ff_exp == n_ff
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff,   n_embd}, TENSOR_NOT_REQUIRED);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd,   n_expert}, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
+
+        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+        if (!layer.ffn_post_norm) {
+            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_grok::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_grok::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/grovemoe.cpp b/src/models/grovemoe.cpp
index 151108a2a71f..feef815165be 100644
--- a/src/models/grovemoe.cpp
+++ b/src/models/grovemoe.cpp
@@ -1,6 +1,70 @@
 #include "models.h"
 
-llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) :
+void llama_model_grovemoe::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH,  hparams.n_ff_chexp, false);
+    ml.get_key(LLM_KV_EXPERT_GROUP_SCALE,                hparams.expert_group_scale);
+    ml.get_key(LLM_KV_EXPERTS_PER_GROUP,                 hparams.n_group_experts);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
+
+    switch (hparams.n_layer) {
+        case 48: type = LLM_TYPE_30B_A3B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_grovemoe::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE");
+    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE");
+    GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE");
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+        // MoE branch
+        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+        const int64_t n_ff_chexp = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k;
+        const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
+
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+
+        layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), {  n_embd, n_ff_chexp, n_chunk_expert}, 0);
+        layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp,   n_embd, n_chunk_expert}, 0);
+        layer.ffn_up_chexps   = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS,   "weight", i), {  n_embd, n_ff_chexp, n_chunk_expert}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_grovemoe::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_grovemoe::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t n_embd_head    = hparams.n_embd_head_v();
     const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
diff --git a/src/models/hunyuan-dense.cpp b/src/models/hunyuan-dense.cpp
index 1cd85d6d9d4d..c137bd37c027 100644
--- a/src/models/hunyuan-dense.cpp
+++ b/src/models/hunyuan-dense.cpp
@@ -1,132 +1,6 @@
 #include "models.h"
 
-llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v();
-
-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
-    GGML_ASSERT(n_embd_head == n_rot);
-
-    const bool use_mrope = hparams.use_mrope();
-
-    int sections[4];
-    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
-
-    ggml_tensor * cur;
-    ggml_tensor * inpL;
-
-    inpL = build_inp_embd(model.tok_embd);
-
-    // inp_pos - contains the positions
-    ggml_tensor * inp_pos = build_inp_pos();
-
-    auto * inp_attn = build_attn_inp_kv();
-
-    const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
-
-    ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-    for (int il = 0; il < n_layer; ++il) {
-        ggml_tensor * inpSA = inpL;
-
-        // norm
-        cur = build_norm(inpL,
-                model.layers[il].attn_norm, NULL,
-                LLM_NORM_RMS, il);
-        cb(cur, "attn_norm", il);
-        // self-attention
-        {
-            // rope freq factors for llama3; may return nullptr for llama2 and other models
-            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
-
-            // compute Q and K and RoPE them
-            auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
-                    n_embd_head, n_head, n_head_kv, il);
-
-            if (use_mrope) {
-                Qcur = ggml_rope_multi(
-                            ctx0, Qcur, inp_pos, rope_factors,
-                            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
-                            ext_factor, attn_factor, beta_fast, beta_slow
-                            );
-
-                Kcur = ggml_rope_multi(
-                            ctx0, Kcur, inp_pos, rope_factors,
-                            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
-                            ext_factor, attn_factor, beta_fast, beta_slow
-                            );
-            } else {
-                Qcur = ggml_rope_ext(
-                            ctx0, Qcur, inp_pos, rope_factors,
-                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                            ext_factor, attn_factor, beta_fast, beta_slow
-                            );
-
-                Kcur = ggml_rope_ext(
-                            ctx0, Kcur, inp_pos, rope_factors,
-                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                            ext_factor, attn_factor, beta_fast, beta_slow
-                            );
-            }
-
-            cb(Qcur, "Qcur", il);
-            cb(Kcur, "Kcur", il);
-            cb(Vcur, "Vcur", il);
-
-            Kcur = build_norm(Kcur,
-                        model.layers[il].attn_k_norm, nullptr,
-                        LLM_NORM_RMS, il);
-            cb(Kcur, "Kcur_norm", il);
-
-            Qcur = build_norm(Qcur,
-                        model.layers[il].attn_q_norm, nullptr,
-                        LLM_NORM_RMS, il);
-            cb(Qcur, "Qcur_norm", il);
-
-            cur = build_attn(inp_attn,
-                    model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
-            cb(cur, "attn_out", il);
-        }
-        if (il == n_layer - 1 && inp_out_ids) {
-            cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
-            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-        }
-        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-        cb(ffn_inp, "ffn_inp", il);
-
-        cur = build_norm(ffn_inp,
-                model.layers[il].ffn_norm, NULL,
-                LLM_NORM_RMS, il);
-        cb(cur, "ffn_norm", il);
-        // feed-forward network (non-MoE)
-        ggml_tensor * cur_mlp = build_ffn(cur,
-                    model.layers[il].ffn_up,   NULL, NULL,
-                    model.layers[il].ffn_gate, NULL, NULL,
-                    model.layers[il].ffn_down, NULL, NULL,
-                    NULL,
-                    LLM_FFN_SILU, LLM_FFN_PAR, il);
-        cb(cur_mlp, "ffn_out", il);
-
-        cur = ggml_add(ctx0, cur_mlp, ffn_inp);
-
-        cur = build_cvec(cur, il);
-        cb(cur, "l_out", il);
-
-        // input for next layer
-        inpL = cur;
-    }
-    cur = inpL;
-
-    cur = build_norm(cur,
-            model.output_norm, NULL,
-            LLM_NORM_RMS, -1);
-
-    cb(cur, "result_norm", -1);
-    res->t_embd = cur;
-    // lm_head
-    cur = build_lora_mm(model.output, cur);
-    cb(cur, "result_output", -1);
-    res->t_logits = cur;
-
-    ggml_build_forward_expand(gf, cur);
+std::unique_ptr<llm_graph_context> llama_model_hunyuan_dense::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
 }
+
diff --git a/src/models/hunyuan-moe.cpp b/src/models/hunyuan-moe.cpp
index ffe1664b0e13..44af42412f74 100644
--- a/src/models/hunyuan-moe.cpp
+++ b/src/models/hunyuan-moe.cpp
@@ -1,6 +1,59 @@
 #include "models.h"
 
-llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_hunyuan_moe::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_A13B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_hunyuan_moe::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+        const uint32_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : hparams.n_ff(i);
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
+
+        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, 0);
+        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_hunyuan_moe::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_hunyuan_moe::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/hunyuan-vl.cpp b/src/models/hunyuan-vl.cpp
new file mode 100644
index 000000000000..5fb9154bec01
--- /dev/null
+++ b/src/models/hunyuan-vl.cpp
@@ -0,0 +1,189 @@
+#include "models.h"
+
+void llama_model_hunyuan_vl::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
+    // XDRoPE / NTK-aware scaling: base = rope_theta * alpha^(dim / (dim - 2))
+    if (hparams.rope_scaling_alpha > 0.0f) {
+        const int dim = hparams.n_embd_head_k();
+        hparams.rope_freq_base_train = hparams.rope_freq_base_train
+            * powf(hparams.rope_scaling_alpha, (float)dim / (float)(dim - 2));
+    }
+
+    switch (hparams.n_embd) {
+        case 1024: type = LLM_TYPE_0_5B; break;
+        case 2048: type = LLM_TYPE_1_8B; break;
+        case 3072: type = LLM_TYPE_4B; break;
+        case 4096: type = LLM_TYPE_7B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_hunyuan_vl::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_hunyuan_vl::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_hunyuan_vl::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v();
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);
+
+    const bool use_mrope = hparams.use_mrope();
+
+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+        // self-attention
+        {
+            // rope freq factors for llama3; may return nullptr for llama2 and other models
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+            // compute Q and K and RoPE them
+            auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
+                    n_embd_head, n_head, n_head_kv, il);
+
+            if (use_mrope) {
+                Qcur = ggml_rope_multi(
+                            ctx0, Qcur, inp_pos, rope_factors,
+                            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+
+                Kcur = ggml_rope_multi(
+                            ctx0, Kcur, inp_pos, rope_factors,
+                            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+            } else {
+                Qcur = ggml_rope_ext(
+                            ctx0, Qcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+
+                Kcur = ggml_rope_ext(
+                            ctx0, Kcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+            }
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            Kcur = build_norm(Kcur,
+                        model.layers[il].attn_k_norm, nullptr,
+                        LLM_NORM_RMS, il);
+            cb(Kcur, "Kcur_norm", il);
+
+            Qcur = build_norm(Qcur,
+                        model.layers[il].attn_q_norm, nullptr,
+                        LLM_NORM_RMS, il);
+            cb(Qcur, "Qcur_norm", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+        // feed-forward network (non-MoE)
+        ggml_tensor * cur_mlp = build_ffn(cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur_mlp, "ffn_out", il);
+
+        cur = ggml_add(ctx0, cur_mlp, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
diff --git a/src/models/internlm2.cpp b/src/models/internlm2.cpp
index 83be2ca0aee6..f0c5580a6f41 100644
--- a/src/models/internlm2.cpp
+++ b/src/models/internlm2.cpp
@@ -1,6 +1,43 @@
 #include "models.h"
 
-llm_build_internlm2::llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_internlm2::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_7B; break;
+        case 48: type = LLM_TYPE_20B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_internlm2::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_internlm2::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_internlm2::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/jais.cpp b/src/models/jais.cpp
index 31101f3c14bd..a6451dca095d 100644
--- a/src/models/jais.cpp
+++ b/src/models/jais.cpp
@@ -1,6 +1,58 @@
 #include "models.h"
 
-llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_jais::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+    ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
+
+    switch (hparams.n_layer) {
+        case 24: type = LLM_TYPE_1_3B; break;
+        case 40: type = LLM_TYPE_13B; break;
+        /* TODO: add variants */
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_jais::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, 0);
+        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, 0);
+
+        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+        layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
+
+        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
+
+        layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "weight", i), {n_embd, n_ff}, 0);
+        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE,   "bias", i),   {n_ff}, 0);
+
+        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_jais::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_jais::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/jais2.cpp b/src/models/jais2.cpp
index 507e04fa4aa9..ad59b953e8dd 100644
--- a/src/models/jais2.cpp
+++ b/src/models/jais2.cpp
@@ -1,8 +1,63 @@
 #include "models.h"
 
+void llama_model_jais2::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_8B; break;
+        case 68: type = LLM_TYPE_70B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_jais2::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    if (!output) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
+
+        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        // attention biases - all have shape n_embd (output dimension of projections)
+        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
+        layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd}, 0);
+        layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd}, 0);
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
+
+        // Jais-2 uses simple MLP (no gate) with biases
+        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
+        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_jais2::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
 // JAIS-2 model graph builder
 // Uses: LayerNorm (not RMSNorm), relu2 activation, separate Q/K/V, RoPE embeddings
-llm_build_jais2::llm_build_jais2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+llama_model_jais2::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/jamba.cpp b/src/models/jamba.cpp
index f82b7795c87f..e1b8d137e38c 100644
--- a/src/models/jamba.cpp
+++ b/src/models/jamba.cpp
@@ -1,6 +1,111 @@
 #include "models.h"
 
-llm_build_jamba::llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_build_mamba_base(params) {
+void llama_model_jamba::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
+    ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
+    ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
+    ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+        hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
+    }
+
+    switch (hparams.n_layer) {
+        // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
+        case 12: // 900M  8x???M
+        case 32: // 51B  16x?B
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_jamba::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const int64_t d_conv  = hparams.ssm_d_conv;
+    const int64_t d_inner = hparams.ssm_d_inner;
+    const int64_t d_state = hparams.ssm_d_state;
+    const int64_t dt_rank = hparams.ssm_dt_rank;
+
+    // only an expansion factor of 2 is supported for now
+    GGML_ASSERT(2 * n_embd == d_inner);
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    {
+        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+        // if output is NULL, init from the input tok embed, duplicated to allow offloading
+        if (output == NULL) {
+            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+        }
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        const int64_t n_head_kv = hparams.n_head_kv(i);
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
+
+        auto & layer = layers[i];
+
+        // norm
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        if (n_head_kv == 0) {
+            // Mamba layer
+            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
+
+            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
+            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
+
+            layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
+
+            layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);
+
+            layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
+            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
+
+            layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
+            layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);
+
+            // no "weight" suffix for these
+            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
+            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
+
+            // out_proj
+            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+        } else {
+            // Attention layers
+
+            create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        }
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+
+        if (layer.ffn_gate_inp) {
+            // MoE
+            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert}, 0);
+        } else {
+            // FFN (no MoE)
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_jamba::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_jamba::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_build_mamba_base(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     ggml_tensor * cur;
diff --git a/src/models/jina-bert-v2.cpp b/src/models/jina-bert-v2.cpp
new file mode 100644
index 000000000000..4f8866ece4d2
--- /dev/null
+++ b/src/models/jina-bert-v2.cpp
@@ -0,0 +1,66 @@
+#include "models.h"
+
+void llama_model_jina_bert_v2::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
+    hparams.f_max_alibi_bias = 8.0f;
+
+    switch (hparams.n_layer) {
+        case 4:  type = LLM_TYPE_33M;  break; // jina-embeddings-small
+        case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_jina_bert_v2::load_arch_tensors(llama_model_loader & ml) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0); // word_embeddings
+    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
+
+    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0); // LayerNorm
+    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {n_embd}, 0); // LayerNorm bias
+
+    cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
+    cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {1},         TENSOR_NOT_REQUIRED);
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i]; // JinaBertLayer
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+
+        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
+
+        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
+        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias",   i), {n_embd}, 0);
+
+        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+
+        const auto tn_ffn_up_weight = tn(LLM_TENSOR_FFN_UP, "weight", i);
+        ggml_tensor * t_ffn_up = ml.get_tensor_meta(tn_ffn_up_weight.str().c_str());
+        const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;
+
+        GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
+        layer.ffn_up   = create_tensor(tn_ffn_up_weight, {n_embd, n_ffn_up}, 0);
+        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ffn_up}, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
+
+        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias",   i), {n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_jina_bert_v2::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
diff --git a/src/models/jina-bert-v3.cpp b/src/models/jina-bert-v3.cpp
new file mode 100644
index 000000000000..e0527529f56c
--- /dev/null
+++ b/src/models/jina-bert-v3.cpp
@@ -0,0 +1,69 @@
+#include "models.h"
+
+void llama_model_jina_bert_v3::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
+
+    switch (hparams.n_layer) {
+        case 24:
+            type = LLM_TYPE_558M; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_jina_bert_v3::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    if (n_token_types == 0) {
+        throw std::runtime_error(arch_name() + " model needs to define token type count");
+    }
+    tok_embd     = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
+    type_embd    = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
+
+    if (arch == LLM_ARCH_BERT) {
+        pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,    "weight"), {n_embd, n_ctx_train}, 0);
+
+        cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+        cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);
+
+        cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+        cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
+    }
+
+    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0);
+    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {n_embd}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd}, 0);
+
+        if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff,   n_expert}, 0);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff,   n_embd, n_expert}, 0);
+            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,   "weight", i), {n_embd, n_expert}, 0);
+        } else {
+            layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
+
+            if (arch == LLM_ARCH_NOMIC_BERT) {
+                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+            }
+        }
+
+        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i),   {n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_jina_bert_v3::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp
index 58c89c417fc3..ecffb105496b 100644
--- a/src/models/kimi-linear.cpp
+++ b/src/models/kimi-linear.cpp
@@ -1,7 +1,175 @@
 #include "models.h"
-
 #include "llama-memory-recurrent.h"
 
+void llama_model_kimi_linear::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,    hparams.n_embd_head_k_mla_impl);
+    ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA,  hparams.n_embd_head_v_mla_impl);
+    ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,      hparams.n_lora_kv);
+    ml.get_key(LLM_KV_SSM_CONV_KERNEL,             hparams.ssm_d_conv);
+    ml.get_key(LLM_KV_KDA_HEAD_DIM,                hparams.n_embd_head_kda);
+
+    // MLA qk_rope_head_dim (for reference)
+    // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192
+
+    // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba)
+    // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention)
+    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+        hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;  // KDA layers are recurrent
+    }
+
+    // MoE parameters - Kimi uses moe_intermediate_size = 1024
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared);
+    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
+
+    switch (hparams.n_layer) {
+        case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_kimi_linear::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        // Check for KDA specific tensors to determine layer type or if it's a mixed model
+        // Assuming KDA layer if KDA tensors are present
+
+        // KDA uses head_dim = 128 (from linear_attn_config.head_dim)
+        const int64_t n_embd_head_k_kda = hparams.n_embd_head_kda;
+        const int64_t n_embd_head_v_kda = hparams.n_embd_head_kda;
+        const int64_t ssm_d_conv = hparams.ssm_d_conv;
+
+        if (hparams.is_recurrent(i)) {
+            // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
+            // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner]
+            layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
+            if (!layer.ssm_q_conv) {
+                layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0);
+            }
+
+             // KDA Layer - Conv1d weights may be 3D or 4D
+             layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
+             if (!layer.ssm_k_conv) {
+                 layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0);
+             }
+             layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
+             if (!layer.ssm_v_conv) {
+                 layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head}, 0);
+             }
+
+             // q, k, v projections
+             // Python: q_proj, k_proj, v_proj
+             create_tensor_qkv(layer, i, n_embd, n_embd_head_k_kda * n_head, n_embd_head_k_kda * n_head, n_embd_head_v_kda * n_head, 0);
+
+             // KDA specific projections
+             // f_a_proj, f_b_proj
+             layer.ssm_f_a = create_tensor(tn(LLM_TENSOR_SSM_F_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0); // head_dim
+             layer.ssm_f_b = create_tensor(tn(LLM_TENSOR_SSM_F_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0); // projection_size
+
+             // b_proj (beta mixing coefficient)
+             layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), {n_embd, n_head}, 0);
+
+             // A_log - Shape in GGUF: [1, num_heads, 1, 1] (4D) or [1, num_heads] (2D after quantization) Note: -exp(A_log) is applied in convert_hf_to_gguf.py
+             layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head, 1, 1}, TENSOR_NOT_REQUIRED);
+             if (!layer.ssm_a) {
+                 layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
+             }
+
+             // dt_bias - shape [n_embd_head_k_kda * n_head] = [4096]
+             layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_embd_head_k_kda * n_head}, 0);
+
+             // g_a_proj, g_b_proj (output gate)
+             layer.ssm_g_a = create_tensor(tn(LLM_TENSOR_SSM_G_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0);
+             layer.ssm_g_b = create_tensor(tn(LLM_TENSOR_SSM_G_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0);
+
+             // o_norm (reusing SSM_NORM)
+             layer.ssm_o_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {n_embd_head_k_kda}, 0); // FusedRMSNormGated
+
+             // o_proj
+             layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v_kda * n_head, n_embd}, 0);
+
+        } else {
+             // MLA Layer - use MLA-specific head dimensions
+             const int64_t q_lora_rank  = hparams.n_lora_q;
+             const int64_t kv_lora_rank = hparams.n_lora_kv;
+             const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
+             const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
+
+             layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED);
+             layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
+
+             if (layer.attn_q_a_norm) {
+                 layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
+                 layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
+             } else {
+                 // Kimi MLA without Q compression: wq = [n_embd, n_head * n_embd_head_k_mla]
+                 layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
+             }
+
+             // Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA)
+             // Note: hparams.n_rot may be 72 (from conversion) but actual is 64
+             const int64_t qk_rope_head_dim = hparams.n_rot();  // From config: qk_rope_head_dim
+             layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
+             // Support Legacy GGUFs that don't split wkv_b (MLA KV cache disabled)
+             layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i),
+                {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
+             if (!layer.wkv_b) { // MLA KV cache enabled
+                 layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_k_mla - qk_rope_head_dim, kv_lora_rank, n_head}, 0);
+                 layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
+             }
+             layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
+        }
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        // MoE intermediate size (different from dense FFN)
+        const int64_t n_ff_exp = hparams.n_ff_exp;
+
+        // Kimi uses n_layer_dense_lead to determine which layers use dense FFN vs MoE
+        // first_k_dense_replace = 1 means layer 0 uses dense FFN, layers 1+ use MoE
+        if (i < (int) hparams.n_layer_dense_lead) {
+            // Dense FFN layer - use normal n_ff
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+        } else {
+            // MoE layer - use n_ff_exp (1024) instead of n_ff (9216)
+            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+            // Shared experts use moe_intermediate_size * num_shared_experts
+            // Kimi: shared_expert_intermediate_size = 1024 * 1 = 1024
+            // Tensors are 2D: [n_embd, n_ff_shexp] or [n_ff_shexp, n_embd]
+            const int64_t n_ff_shexp_actual = n_ff_exp * (hparams.n_expert_shared > 0 ? hparams.n_expert_shared : 1);
+            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp_actual, n_embd}, TENSOR_NOT_REQUIRED);
+            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED);
+
+            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_kimi_linear::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
 // Causal Conv1d function for Q,K,V
 // When qkv is 0, it is Q, 1 is K, 2 is V
 static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_tensor * conv_states_all, ggml_tensor * conv_state_all, int64_t qkv, ggml_tensor * x, ggml_tensor * proj_w, ggml_tensor * conv_w, int64_t d_conv, int64_t head_dim, int64_t n_head, int64_t n_seq_tokens, int64_t n_seqs, int64_t n_tokens, int64_t kv_head) {
@@ -63,7 +231,7 @@ static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_t
     return ggml_reshape_4d(ctx0, Xcur, head_dim, n_head, n_seq_tokens, n_seqs);
 }
 
-llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params) :
+llama_model_kimi_linear::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_build_delta_net_base(params), model(model) {
     ggml_tensor * cur;
     ggml_tensor * inpL;
diff --git a/src/models/lfm2.cpp b/src/models/lfm2.cpp
index eb8ec3c803a3..df6a8028736f 100644
--- a/src/models/lfm2.cpp
+++ b/src/models/lfm2.cpp
@@ -1,10 +1,94 @@
 #include "models.h"
-
 #include "../llama-memory-hybrid-iswa.h"
 #include "../llama-memory-hybrid.h"
 
+void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_SHORTCONV_L_CACHE,           hparams.n_shortconv_l_cache);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+        hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
+    }
+    hparams.n_layer_dense_lead = hparams.n_layer;
+    switch (hparams.n_ff()) {
+        case  4608: type = LLM_TYPE_350M; break;
+        case  6912: type = LLM_TYPE_700M; break;
+        case  8192: type = LLM_TYPE_1_2B; break;
+        case 10752: type = LLM_TYPE_2_6B; break;
+        default:    type = LLM_TYPE_UNKNOWN;
+    }
+    if (const auto is_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); is_swa && hparams.n_swa > 0) {
+        hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+        for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+            hparams.swa_layers[il] = !hparams.recurrent_layer_arr[il];
+        }
+    }
+}
+
+void llama_model_lfm2::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,           "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        const bool is_moe_layer = i >= static_cast<int>(hparams.n_layer_dense_lead);
+
+        // ffn/moe is same for transformer and conv layers
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        if (is_moe_layer) {
+            GGML_ASSERT(n_expert && n_expert_used);
+            layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i),  {n_embd, n_expert}, 0);
+            layer.ffn_gate_exps   = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
+            layer.ffn_down_exps   = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {hparams.n_ff_exp,   n_embd, n_expert}, 0);
+            layer.ffn_up_exps     = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i),   {n_embd, hparams.n_ff_exp, n_expert}, 0);
+            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
+        } else {  // dense
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+        }
+
+        // for operator_norm
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        if (!hparams.is_recurrent(i)) {
+            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+            GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
+
+            create_tensor_qkv(layer, i, n_embd, n_embd, hparams.n_embd_k_gqa(i), hparams.n_embd_v_gqa(i), 0);
+
+            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        } else {
+            layer.shortconv.conv     = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV,    "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
+            layer.shortconv.in_proj  = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ,  "weight", i), {n_embd, 3 * n_embd}, 0);
+            layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
+        }
+    }
+
+    // for LFM2-ColBert-350M
+    dense_2_out_layers   = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
+    dense_2_out_layers_b = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "bias"),   {hparams.n_embd_out()        }, TENSOR_NOT_REQUIRED);
+}
+
+std::unique_ptr<llm_graph_context> llama_model_lfm2::build_arch_graph(const llm_graph_params & params) const {
+    if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+        return std::make_unique<graph<true>>(*this, params);
+    } else {
+        return std::make_unique<graph<false>>(*this, params);
+    }
+}
+
 template <bool iswa>
-llm_build_lfm2<iswa>::llm_build_lfm2(const llama_model & model, const llm_graph_params & params) :
+llama_model_lfm2::graph<iswa>::graph(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     using inp_hybrid_type = std::conditional_t<iswa, llm_graph_input_mem_hybrid_iswa,  llm_graph_input_mem_hybrid>;
     using inp_attn_type   = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa,     llm_graph_input_attn_kv>;
@@ -187,5 +271,5 @@ llm_build_lfm2<iswa>::llm_build_lfm2(const llama_model & model, const llm_graph_
 }
 
 // Explicit template instantiations
-template struct llm_build_lfm2<true>;
-template struct llm_build_lfm2<false>;
+template struct llama_model_lfm2::graph<true>;
+template struct llama_model_lfm2::graph<false>;
diff --git a/src/models/lfm2moe.cpp b/src/models/lfm2moe.cpp
new file mode 100644
index 000000000000..12a66c05c7dc
--- /dev/null
+++ b/src/models/lfm2moe.cpp
@@ -0,0 +1,85 @@
+#include "models.h"
+#include "../llama-memory-hybrid-iswa.h"
+#include "../llama-memory-hybrid.h"
+
+void llama_model_lfm2moe::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_SHORTCONV_L_CACHE,           hparams.n_shortconv_l_cache);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func);
+
+    for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+        hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
+    }
+
+    switch (hparams.n_layer) {
+        case 24: type = LLM_TYPE_8B_A1B;  break;
+        case 40: type = LLM_TYPE_24B_A2B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_lfm2moe::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM_LFM2, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,           "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        const bool is_moe_layer = i >= static_cast<int>(hparams.n_layer_dense_lead);
+
+        // ffn/moe is same for transformer and conv layers
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        if (is_moe_layer) {
+            GGML_ASSERT(n_expert && n_expert_used);
+            layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i),  {n_embd, n_expert}, 0);
+            layer.ffn_gate_exps   = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, hparams.n_ff_exp, n_expert}, 0);
+            layer.ffn_down_exps   = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {hparams.n_ff_exp,   n_embd, n_expert}, 0);
+            layer.ffn_up_exps     = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i),   {n_embd, hparams.n_ff_exp, n_expert}, 0);
+            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
+        } else {  // dense
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+        }
+
+        // for operator_norm
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        if (!hparams.is_recurrent(i)) {
+            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+            GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
+
+            create_tensor_qkv(layer, i, n_embd, n_embd, hparams.n_embd_k_gqa(i), hparams.n_embd_v_gqa(i), 0);
+
+            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        } else {
+            layer.shortconv.conv     = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV,    "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
+            layer.shortconv.in_proj  = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ,  "weight", i), {n_embd, 3 * n_embd}, 0);
+            layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
+        }
+    }
+
+    // for LFM2-ColBert-350M
+    dense_2_out_layers   = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
+    dense_2_out_layers_b = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "bias"),   {hparams.n_embd_out()        }, TENSOR_NOT_REQUIRED);
+}
+
+std::unique_ptr<llm_graph_context> llama_model_lfm2moe::build_arch_graph(const llm_graph_params & params) const {
+    if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+        return std::make_unique<graph<true>>(*this, params);
+    } else {
+        return std::make_unique<graph<false>>(*this, params);
+    }
+}
+
diff --git a/src/models/llada-moe.cpp b/src/models/llada-moe.cpp
index c756d6fde5f5..b60f67f6c4bf 100644
--- a/src/models/llada-moe.cpp
+++ b/src/models/llada-moe.cpp
@@ -1,6 +1,56 @@
 #include "models.h"
 
-llm_build_llada_moe::llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_llada_moe::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    // diffusion language model uses non-causal attention
+    hparams.causal_attn = false;
+    switch (hparams.n_layer) {
+        case 16: type = LLM_TYPE_A1_7B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_llada_moe::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
+    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_llada_moe::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_llada_moe::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/llada.cpp b/src/models/llada.cpp
index 501df3c7eaf2..fa21c5fe32ca 100644
--- a/src/models/llada.cpp
+++ b/src/models/llada.cpp
@@ -1,6 +1,72 @@
 #include "models.h"
 
-llm_build_llada::llm_build_llada(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_llada::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
+    switch (hparams.n_layer) {
+        case 32:
+            type = LLM_TYPE_8B;
+            break;
+        default:
+            type = LLM_TYPE_UNKNOWN;
+    }
+    // Set non-causal attention for diffusion models
+    hparams.causal_attn = false;
+}
+
+void llama_model_llada::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output =
+            create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+        // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
+        layer.wq =
+            create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+        // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
+        layer.wo =
+            create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+
+        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
+                                         TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+
+        // optional MLP bias
+        layer.ffn_gate_b =
+            create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
+        layer.ffn_down_b =
+            create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
+        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_llada::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_llada::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     // LLaDA is similar to LLaMA but uses non-causal attention for diffusion
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
diff --git a/src/models/llama-embed.cpp b/src/models/llama-embed.cpp
new file mode 100644
index 000000000000..0699e744461b
--- /dev/null
+++ b/src/models/llama-embed.cpp
@@ -0,0 +1,6 @@
+#include "models.h"
+
+std::unique_ptr<llm_graph_context> llama_model_llama_embed::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph<true>>(*this, params);
+}
+
diff --git a/src/models/llama.cpp b/src/models/llama.cpp
index 8d478dc67475..10523117c59f 100644
--- a/src/models/llama.cpp
+++ b/src/models/llama.cpp
@@ -1,7 +1,101 @@
 #include "models.h"
 
+void llama_model_llama::load_arch_hparams(llama_model_loader & ml) {
+    const auto n_vocab = vocab.n_tokens();
+
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    if (hparams.n_expert == 8) {
+        switch (hparams.n_layer) {
+            case 32: type = LLM_TYPE_8x7B; break;
+            case 56: type = LLM_TYPE_8x22B; break;
+            default: type = LLM_TYPE_UNKNOWN;
+        }
+    } else {
+        switch (hparams.n_layer) {
+            case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
+            case 22: type = LLM_TYPE_1B; break;
+            case 26: type = LLM_TYPE_3B; break;
+            case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
+            case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
+            // granite uses a vocab with len 49152
+            case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
+            case 36: type = LLM_TYPE_8B; break; // granite
+            case 40: type = LLM_TYPE_13B; break;
+            case 48: type = LLM_TYPE_34B; break;
+            case 60: type = LLM_TYPE_30B; break;
+            case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
+            default: type = LLM_TYPE_UNKNOWN;
+        }
+    }
+}
+
+void llama_model_llama::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        // optional bias tensors
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        }
+        else {
+            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        }
+
+        if (n_expert == 0) {
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+            // optional MLP bias
+            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+        } else {
+            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
+
+            // For Granite MoE Shared
+            if (hparams.n_ff_shexp > 0) {
+                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+            }
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_llama::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph<false>>(*this, params);
+}
+
 template <bool embed>
-llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+llama_model_llama::graph<embed>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
@@ -149,5 +243,5 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
     ggml_build_forward_expand(gf, cur);
 }
 
-template struct llm_build_llama<false>;
-template struct llm_build_llama<true>;
+template struct llama_model_llama::graph<false>;
+template struct llama_model_llama::graph<true>;
diff --git a/src/models/llama4.cpp b/src/models/llama4.cpp
index 4e4bfb43f332..899611d53f60 100644
--- a/src/models/llama4.cpp
+++ b/src/models/llama4.cpp
@@ -1,7 +1,109 @@
 #include "models.h"
 
+void llama_model_llama4::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
+    ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,   hparams.n_moe_layer_step);
+
+    const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+    if (found_swa && hparams.n_swa == 0) {
+        hparams.swa_type             = LLAMA_SWA_TYPE_NONE;
+        hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
+    } else {
+        hparams.swa_type                = LLAMA_SWA_TYPE_CHUNKED;
+        hparams.n_swa                   = 8192;
+        hparams.n_attn_temp_floor_scale = 8192;
+        hparams.f_attn_temp_scale       = 0.1f;
+        hparams.f_attn_temp_offset      = 1.0f;
+        uint32_t swa_period             = 4; // pattern: 3 chunked - 1 full
+        ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+        hparams.set_swa_pattern(swa_period);
+
+        hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+    }
+
+    switch (hparams.n_expert) {
+        case 0: {
+            // MobileLLM (no MoE)
+            switch (hparams.n_embd) {
+                case 2048: type = LLM_TYPE_140M; break;
+                case 4096: type = LLM_TYPE_360M; break;
+                case 6144: type = LLM_TYPE_950M; break;
+                default:   type = LLM_TYPE_UNKNOWN;
+            }
+        } break;
+        case 16:  type = LLM_TYPE_17B_16E; break;
+        case 128: type = LLM_TYPE_17B_128E; break;
+        default:  type = LLM_TYPE_UNKNOWN;
+    }
+
+    hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
+}
+
+void llama_model_llama4::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    if (n_expert == 0) {
+        throw std::runtime_error(arch_name() + " model cannot have zero experts");
+    }
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        const bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
+
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+        if (is_moe_layer) {
+            const int64_t n_ff_exp = hparams.n_ff_exp;
+
+            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff_exp, n_embd, n_expert}, 0);
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
+
+            // Shared expert
+            const int64_t n_ff_shexp = n_ff_exp;
+            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, n_ff_shexp}, 0);
+            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd    }, 0);
+            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, n_ff_shexp}, 0);
+        } else {
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_llama4::build_arch_graph(const llm_graph_params & params) const {
+    if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
+        return std::make_unique<graph<false>>(*this, params);
+    } else {
+        return std::make_unique<graph<true>>(*this, params);
+    }
+}
+
 template <bool iswa>
-llm_build_llama4<iswa>::llm_build_llama4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+llama_model_llama4::graph<iswa>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
@@ -167,5 +269,5 @@ llm_build_llama4<iswa>::llm_build_llama4(const llama_model & model, const llm_gr
 }
 
 // Explicit template instantiations
-template struct llm_build_llama4<false>;
-template struct llm_build_llama4<true>;
+template struct llama_model_llama4::graph<false>;
+template struct llama_model_llama4::graph<true>;
diff --git a/src/models/maincoder.cpp b/src/models/maincoder.cpp
index 8a76931c0078..3dbd82fd3620 100644
--- a/src/models/maincoder.cpp
+++ b/src/models/maincoder.cpp
@@ -1,6 +1,49 @@
 #include "models.h"
 
-llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_maincoder::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_1B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_maincoder::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_maincoder::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_maincoder::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/mamba.cpp b/src/models/mamba.cpp
index 55fd2e055c49..b7708d7fdd18 100644
--- a/src/models/mamba.cpp
+++ b/src/models/mamba.cpp
@@ -1,6 +1,90 @@
 #include "models.h"
 
-llm_build_mamba::llm_build_mamba(const llama_model & model, const llm_graph_params & params) : llm_build_mamba_base(params) {
+void llama_model_mamba::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
+    ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
+    ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
+    ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+    ml.get_key(LLM_KV_SSM_DT_B_C_RMS,     hparams.ssm_dt_b_c_rms, false);
+
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    switch (hparams.n_layer) {
+        case 24:
+            switch (hparams.n_embd) {
+                case 768: type = LLM_TYPE_SMALL; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 48:
+            switch (hparams.n_embd) {
+                case 1024: type = LLM_TYPE_MEDIUM; break;
+                case 1536: type = LLM_TYPE_LARGE; break;
+                case 2048: type = LLM_TYPE_XL; break;
+                default:   type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 64:
+            switch (hparams.n_embd) {
+                case 2560: type = LLM_TYPE_3B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_mamba::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const int64_t d_conv  = hparams.ssm_d_conv;
+    const int64_t d_inner = hparams.ssm_d_inner;
+    const int64_t d_state = hparams.ssm_d_state;
+    const int64_t dt_rank = hparams.ssm_dt_rank;
+
+    // only an expansion factor of 2 is supported for now
+    if (2 * n_embd != d_inner) {
+        throw std::runtime_error("only an expansion factor of 2 is supported for now");
+    }
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed, duplicated to allow offloading
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        // norm
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
+
+        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
+        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
+
+        layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
+
+        layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
+        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
+
+        // no "weight" suffix for these
+        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
+        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
+
+        // out_proj
+        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_mamba::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_mamba::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_build_mamba_base(params) {
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
@@ -51,4 +135,3 @@ llm_build_mamba::llm_build_mamba(const llama_model & model, const llm_graph_para
 
     ggml_build_forward_expand(gf, cur);
 }
-
diff --git a/src/models/mamba2.cpp b/src/models/mamba2.cpp
new file mode 100644
index 000000000000..3277ca53ec4b
--- /dev/null
+++ b/src/models/mamba2.cpp
@@ -0,0 +1,87 @@
+#include "models.h"
+
+void llama_model_mamba2::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
+    ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
+    ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
+    ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+    ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
+
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    switch (hparams.n_layer) {
+        case 24:
+            switch (hparams.n_embd) {
+                case 768: type = LLM_TYPE_SMALL; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 48:
+            switch (hparams.n_embd) {
+                case 1024: type = LLM_TYPE_MEDIUM; break;
+                case 1536: type = LLM_TYPE_LARGE; break;
+                case 2048: type = LLM_TYPE_XL; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 64:
+            switch (hparams.n_embd) {
+                case 2560: type = LLM_TYPE_3B; break;
+                case 4096: type = LLM_TYPE_7B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_mamba2::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const int64_t d_conv  = hparams.ssm_d_conv;
+    const int64_t d_inner = hparams.ssm_d_inner;
+    const int64_t d_state = hparams.ssm_d_state;
+    const int64_t n_group = hparams.ssm_n_group;
+    const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
+
+    // only an expansion factor of 2 is supported for now
+    GGML_ASSERT(2 * n_embd == d_inner);
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    {
+        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+        // if output is NULL, init from the input tok embed, duplicated to allow offloading
+        if (output == NULL) {
+            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+        }
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        // norm
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
+
+        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
+        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
+
+        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
+
+        // no "weight" suffix for these
+        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
+        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
+
+        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
+
+        // out_proj
+        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_mamba2::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
diff --git a/src/models/mimo2-iswa.cpp b/src/models/mimo2.cpp
similarity index 57%
rename from src/models/mimo2-iswa.cpp
rename to src/models/mimo2.cpp
index 52c6acfe2143..eb10d5807a9c 100644
--- a/src/models/mimo2-iswa.cpp
+++ b/src/models/mimo2.cpp
@@ -1,6 +1,64 @@
 #include "models.h"
 
-llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_mimo2::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,   hparams.n_swa);
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,         hparams.rope_freq_base_train_swa, false);
+    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
+
+    switch (hparams.n_layer) {
+        case 48: type = LLM_TYPE_310B_A15B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_mimo2::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+        uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
+        uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
+        uint32_t n_head = hparams.n_head(i);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_v * n_head, n_embd }, 0);
+
+        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_ATTN_NORM,  "weight", i), {n_embd}, 0);
+        layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        // non-MoE branch
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, TENSOR_NOT_REQUIRED);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
+
+        // MoE branch
+        int64_t n_ff_exp = hparams.n_ff_exp;
+        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp,   n_expert}, TENSOR_NOT_REQUIRED);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp,   n_expert}, TENSOR_NOT_REQUIRED);
+        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_mimo2::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
diff --git a/src/models/minicpm.cpp b/src/models/minicpm.cpp
new file mode 100644
index 000000000000..966d3af615c0
--- /dev/null
+++ b/src/models/minicpm.cpp
@@ -0,0 +1,89 @@
+#include "models.h"
+
+void llama_model_minicpm::load_arch_hparams(llama_model_loader & ml) {
+    // Backward-compatible defaults for older MiniCPM GGUFs
+    hparams.f_embedding_scale = 12.0f;
+    hparams.f_residual_scale  = 1.4f / sqrtf(float(hparams.n_layer));
+    hparams.f_logit_scale     = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f;
+
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    // Optional KV reads, override defaults if present in newer GGUF exports
+    ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /*required=*/false);
+    ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /*required=*/false);
+    ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /*required=*/false);
+
+    // MiniCPM uses rope by default, unlike Granite which uses it as a switch
+    hparams.rope_finetuned = true;
+
+    switch (hparams.n_layer) {
+        case 52: type = LLM_TYPE_1B; break;
+        case 40: type = LLM_TYPE_2B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_minicpm::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        // optional bias tensors
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        }
+        else {
+            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        }
+
+        if (n_expert == 0) {
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+            // optional MLP bias
+            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+        } else {
+            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
+
+            // For Granite MoE Shared
+            if (hparams.n_ff_shexp > 0) {
+                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+            }
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_minicpm::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
diff --git a/src/models/minicpm3.cpp b/src/models/minicpm3.cpp
index bf12ab73c74c..ff5eb6ffa5f0 100644
--- a/src/models/minicpm3.cpp
+++ b/src/models/minicpm3.cpp
@@ -1,6 +1,66 @@
 #include "models.h"
 
-llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_minicpm3::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,       hparams.n_lora_q);
+    ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,      hparams.n_lora_kv);
+
+    switch (hparams.n_layer) {
+        case 62: type = LLM_TYPE_4B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_minicpm3::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const int64_t n_embd_head_qk_rope = hparams.n_rot();
+    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
+
+    const int64_t q_lora_rank  = hparams.n_lora_q;
+    const int64_t kv_lora_rank = hparams.n_lora_kv;
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
+
+        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
+
+        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
+        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
+
+        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
+        layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
+        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_minicpm3::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_minicpm3::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     //TODO: if the model varies, these parameters need to be read from the model
     const int64_t n_embd_base = 256;
     const float scale_embd  = 12.0f;
diff --git a/src/models/minimax-m2.cpp b/src/models/minimax-m2.cpp
index b809b79f2b9f..0dee89346920 100644
--- a/src/models/minimax-m2.cpp
+++ b/src/models/minimax-m2.cpp
@@ -1,6 +1,50 @@
 #include "models.h"
 
-llm_build_minimax_m2::llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_minimax_m2::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,  hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,   hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,           hparams.expert_gating_func, false);
+
+    switch (hparams.n_layer) {
+        case 62: type = LLM_TYPE_230B_A10B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_minimax_m2::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k * n_head}, 0);
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_k_gqa}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
+        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_minimax_m2::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_minimax_m2::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/mistral3.cpp b/src/models/mistral3.cpp
index b5ae72a2ee13..708da49af1f0 100644
--- a/src/models/mistral3.cpp
+++ b/src/models/mistral3.cpp
@@ -1,6 +1,96 @@
 #include "models.h"
 
-llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_mistral3::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+
+    ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast,    false);
+    ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow,    false);
+    ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,   hparams.rope_yarn_log_mul, false);
+
+    hparams.f_attn_temp_offset = 0.0f;
+
+    // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
+        if (hparams.n_attn_temp_floor_scale == 0) {
+            throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
+        }
+    }
+
+    switch (hparams.n_layer) {
+        case 26: type = LLM_TYPE_3B; break;
+        case 34: type = LLM_TYPE_8B; break;
+        case 40: type = LLM_TYPE_14B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_mistral3::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        // optional bias tensors
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        }
+        else {
+            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        }
+
+        if (n_expert == 0) {
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+            // optional MLP bias
+            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+        } else {
+            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
+
+            // For Granite MoE Shared
+            if (hparams.n_ff_shexp > 0) {
+                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+            }
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_mistral3::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_mistral3::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/mistral4.cpp b/src/models/mistral4.cpp
new file mode 100644
index 000000000000..3d9190650e37
--- /dev/null
+++ b/src/models/mistral4.cpp
@@ -0,0 +1,6 @@
+#include "models.h"
+
+std::unique_ptr<llm_graph_context> llama_model_mistral4::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
diff --git a/src/models/models.h b/src/models/models.h
index 94991c55fe87..6d5f18a8e209 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -2,6 +2,7 @@
 
 #include "llama-model.h"
 #include "llama-graph.h"
+#include "llama-model-loader.h"
 
 // note: almost all graphs require at least sqrtf, so include cmath globally
 #include <cmath>
@@ -110,611 +111,1750 @@ struct llm_build_rwkv7_base : public llm_graph_context {
 // models
 //
 
-struct llm_build_afmoe : public llm_graph_context {
-    llm_build_afmoe(const llama_model & model, const llm_graph_params & params);
+struct llama_model_llama : public llama_model_base {
+    llama_model_llama(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    template <bool embed>
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_apertus : public llm_graph_context {
-    llm_build_apertus(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_llama4 : public llama_model_base {
+    llama_model_llama4(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    template <bool iswa>
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_arcee : public llm_graph_context {
-    llm_build_arcee(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_llama_embed : public llama_model_llama {
+    llama_model_llama_embed(const struct llama_model_params & params) : llama_model_llama(params) {}
+    // reuse load_arch_hparams and load_arch_tensors from llama_model_llama
+
+    template <bool embed>
+    using graph = llama_model_llama::graph<embed>;
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_arctic : public llm_graph_context {
-    llm_build_arctic(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_maincoder : public llama_model_base {
+    llama_model_maincoder(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_arwkv7 : public llm_build_rwkv7_base {
-    llm_build_arwkv7(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_deci : public llama_model_base {
+    llama_model_deci(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_baichuan : public llm_graph_context {
-    llm_build_baichuan(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_baichuan : public llama_model_base {
+    llama_model_baichuan(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_bailingmoe2 : public llm_graph_context {
-    llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_falcon : public llama_model_base {
+    llama_model_falcon(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_bailingmoe : public llm_graph_context {
-    llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_grok : public llama_model_base {
+    llama_model_grok(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_bert : public llm_graph_context {
-    llm_build_bert(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_starcoder : public llama_model_base {
+    llama_model_starcoder(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_bitnet : public llm_graph_context {
-    llm_build_bitnet(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_refact : public llama_model_base {
+    llama_model_refact(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_bloom : public llm_graph_context {
-    llm_build_bloom(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_bert : public llama_model_base {
+    llama_model_bert(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_chameleon : public llm_graph_context {
-    llm_build_chameleon(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_jina_bert_v2 : public llama_model_base {
+    llama_model_jina_bert_v2(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    using graph = llama_model_bert::graph;
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_chatglm : public llm_graph_context {
-    llm_build_chatglm(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_jina_bert_v3 : public llama_model_base {
+    llama_model_jina_bert_v3(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    using graph = llama_model_bert::graph;
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_codeshell : public llm_graph_context {
-    llm_build_codeshell(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_nomic_bert : public llama_model_base {
+    llama_model_nomic_bert(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    using graph = llama_model_bert::graph;
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_cogvlm : public llm_graph_context {
-    llm_build_cogvlm(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_nomic_bert_moe : public llama_model_base {
+    llama_model_nomic_bert_moe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    using graph = llama_model_bert::graph;
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_cohere2_iswa : public llm_graph_context {
-    llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_modern_bert : public llama_model_base {
+    llama_model_modern_bert(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_command_r : public llm_graph_context {
-    llm_build_command_r(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_neo_bert : public llama_model_base {
+    llama_model_neo_bert(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_dbrx : public llm_graph_context {
-    llm_build_dbrx(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_eurobert : public llama_model_base {
+    llama_model_eurobert(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_deci : public llm_graph_context {
-    llm_build_deci(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_bloom : public llama_model_base {
+    llama_model_bloom(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_deepseek2 : public llm_graph_context {
-    llm_build_deepseek2(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_mpt : public llama_model_base {
+    llama_model_mpt(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_deepseek : public llm_graph_context {
-    llm_build_deepseek(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_stablelm : public llama_model_base {
+    llama_model_stablelm(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_dots1 : public llm_graph_context {
-    llm_build_dots1(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_qwen : public llama_model_base {
+    llama_model_qwen(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_dream : public llm_graph_context {
-    llm_build_dream(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_qwen2 : public llama_model_base {
+    llama_model_qwen2(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_ernie4_5 : public llm_graph_context {
-    llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_dream : public llama_model_base {
+    llama_model_dream(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_ernie4_5_moe : public llm_graph_context {
-    llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_llada : public llama_model_base {
+    llama_model_llada(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_paddleocr : public llm_graph_context {
-    llm_build_paddleocr(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_llada_moe : public llama_model_base {
+    llama_model_llada_moe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-template <bool iswa>
-struct llm_build_exaone4 : public llm_graph_context {
-    llm_build_exaone4(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_rnd1 : public llama_model_base {
+    llama_model_rnd1(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_exaone : public llm_graph_context {
-    llm_build_exaone(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_qwen2vl : public llama_model_base {
+    llama_model_qwen2vl(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_exaone_moe : public llm_graph_context {
-    llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_qwen2moe : public llama_model_base {
+    llama_model_qwen2moe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_falcon : public llm_graph_context {
-    llm_build_falcon(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_qwen3 : public llama_model_base {
+    llama_model_qwen3(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_falcon_h1 : public llm_build_mamba_base {
-    llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_qwen3moe : public llama_model_base {
+    llama_model_qwen3moe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_gemma2_iswa : public llm_graph_context {
-    llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_qwen3vl : public llama_model_base {
+    llama_model_qwen3vl(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-template <bool iswa>
-struct llm_build_gemma3 : public llm_graph_context {
-    llm_build_gemma3(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_qwen3vlmoe : public llama_model_base {
+    llama_model_qwen3vlmoe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_gemma3n_iswa : public llm_graph_context {
-    const llama_model & model;
 
-    const int64_t n_embd_head;
-    const int64_t n_embd_altup;
-    const int64_t n_altup;
-    const int     i_altup_act;
-    const int     n_layer_sparsity = 10; // number of layers using activation sparsity
-    const float   f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
+struct llama_model_phi2 : public llama_model_base {
+    llama_model_phi2(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
 
-    llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params);
-    ggml_tensor * calc_magnitude(ggml_tensor * x);
 
-    // TODO: refactor in common "per-layer" functionality [TAG_PER_LAYER]
-    ggml_tensor * build_inp_per_layer();
-    ggml_tensor * project_per_layer_inputs(ggml_tensor * inp_batch, ggml_tensor * inp_per_layer);
+struct llama_model_phi3 : public llama_model_base {
+    llama_model_phi3(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
 
-    ggml_tensor * gaussian_topk(ggml_tensor * x);
-    ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il);
-    ggml_tensor * altup_predict(ggml_tensor * cur, int il);
-    ggml_tensor * laurel(ggml_tensor * cur, int il);
-    ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il);
+    template <bool iswa>
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_gemma4_iswa : public llm_graph_context {
-    const llama_model & model;
 
-    const int64_t n_embd_per_layer;
+struct llama_model_phimoe : public llama_model_base {
+    llama_model_phimoe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
 
-    llm_build_gemma4_iswa(const llama_model & model, const llm_graph_params & params);
+    template <bool iswa>
+    using graph = llama_model_phi3::graph<iswa>;
 
-    // TODO: refactor in common "per-layer" functionality [TAG_PER_LAYER]
-    ggml_tensor * build_inp_per_layer();
-    ggml_tensor * project_per_layer_inputs(ggml_tensor * inp_batch, ggml_tensor * inp_per_layer);
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_gemma_embedding : public llm_graph_context {
-    llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_plamo : public llama_model_base {
+    llama_model_plamo(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_gemma : public llm_graph_context {
-    llm_build_gemma(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_plamo2 : public llama_model_base {
+    llama_model_plamo2(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_build_mamba_base {
+        graph(const llama_model & model, const llm_graph_params & params);
+        private:
+            ggml_tensor * build_plamo2_mamba_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il);
+            ggml_tensor * build_plamo2_attn_layer(llm_graph_input_attn_kv * inp, ggml_tensor * inp_pos, ggml_tensor * cur,
+                                                    const llama_model & model, int il);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_glm4 : public llm_graph_context {
-    llm_build_glm4(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_plamo3 : public llama_model_base {
+    llama_model_plamo3(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    template <bool iswa>
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_glm4_moe : public llm_graph_context {
-    llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_gpt2 : public llama_model_base {
+    llama_model_gpt2(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_gpt2 : public llm_graph_context {
-    llm_build_gpt2(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_codeshell : public llama_model_base {
+    llama_model_codeshell(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_gptneox : public llm_graph_context {
-    llm_build_gptneox(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_orion : public llama_model_base {
+    llama_model_orion(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_granite : public llm_graph_context {
-    llm_build_granite(const llama_model & model, const llm_graph_params & params);
 
-private:
-    ggml_tensor * build_attention_layer(
-              ggml_tensor             * cur,
-              ggml_tensor             * inp_pos,
-              llm_graph_input_attn_kv * inp_attn,
-        const llama_model             & model,
-        const int64_t                 n_embd_head,
-        const int                     il);
+struct llama_model_internlm2 : public llama_model_base {
+    llama_model_internlm2(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
 
-    ggml_tensor * build_layer_ffn(
-              ggml_tensor       * cur,
-              ggml_tensor       * inpSA,
-        const llama_model       & model,
-        const int                 il);
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_granite_hybrid : public llm_build_mamba_base {
-    llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params);
-    ggml_tensor * build_layer_ffn(ggml_tensor * cur, ggml_tensor * inpSA, const llama_model & model, const int il);
-    ggml_tensor * build_attention_layer(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn,
-        const llama_model & model,const int64_t n_embd_head, const int il);
+
+struct llama_model_minicpm3 : public llama_model_base {
+    llama_model_minicpm3(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_grok : public llm_graph_context {
-    llm_build_grok(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_gemma : public llama_model_base {
+    llama_model_gemma(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_grovemoe : public llm_graph_context {
-    llm_build_grovemoe(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_gemma2 : public llama_model_base {
+    llama_model_gemma2(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_hunyuan_dense : public llm_graph_context {
-    llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_gemma3 : public llama_model_base {
+    llama_model_gemma3(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    template <bool iswa>
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_hunyuan_moe : public llm_graph_context {
-    llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_gemma3n : public llama_model_base {
+    llama_model_gemma3n(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        const llama_model & model;
+
+        const int64_t n_embd_head;
+        const int64_t n_embd_altup;
+        const int64_t n_altup;
+        const int     i_altup_act;
+        const int     n_layer_sparsity = 10; // number of layers using activation sparsity
+        const float   f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
+
+        graph(const llama_model & model, const llm_graph_params & params);
+        ggml_tensor * calc_magnitude(ggml_tensor * x);
+
+        // TODO: refactor in common "per-layer" functionality [TAG_PER_LAYER]
+        ggml_tensor * build_inp_per_layer();
+        ggml_tensor * project_per_layer_inputs(ggml_tensor * inp_batch, ggml_tensor * inp_per_layer);
+
+        ggml_tensor * gaussian_topk(ggml_tensor * x);
+        ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il);
+        ggml_tensor * altup_predict(ggml_tensor * cur, int il);
+        ggml_tensor * laurel(ggml_tensor * cur, int il);
+        ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_internlm2 : public llm_graph_context {
-    llm_build_internlm2(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_gemma4 : public llama_model_base {
+    llama_model_gemma4(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        const llama_model & model;
+
+        const int64_t n_embd_per_layer;
+
+        graph(const llama_model & model, const llm_graph_params & params);
+
+        // TODO: refactor in common "per-layer" functionality [TAG_PER_LAYER]
+        ggml_tensor * build_inp_per_layer();
+        ggml_tensor * project_per_layer_inputs(ggml_tensor * inp_batch, ggml_tensor * inp_per_layer);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_jais : public llm_graph_context {
-    llm_build_jais(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_gemma_embedding : public llama_model_base {
+    llama_model_gemma_embedding(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_jais2 : public llm_graph_context {
-    llm_build_jais2(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_starcoder2 : public llama_model_base {
+    llama_model_starcoder2(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_jamba : public llm_build_mamba_base {
-    llm_build_jamba(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_mamba : public llama_model_base {
+    llama_model_mamba(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_build_mamba_base {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_kimi_linear : public llm_build_delta_net_base {
-    llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params);
 
-    std::pair<ggml_tensor *, ggml_tensor *> build_kda_autoregressive(
-                ggml_tensor * q,
-                ggml_tensor * k,
-                ggml_tensor * v,
-                ggml_tensor * gk,
-                ggml_tensor * beta,
-                ggml_tensor * state,
-                        int   il);
+struct llama_model_mamba2 : public llama_model_base {
+    llama_model_mamba2(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
 
-    std::pair<ggml_tensor *, ggml_tensor *> build_kda_chunking(
-                ggml_tensor * q,
-                ggml_tensor * k,
-                ggml_tensor * v,
-                ggml_tensor * gk,
-                ggml_tensor * beta,
-                ggml_tensor * state,
-                ggml_tensor * causal_mask,
-                ggml_tensor * identity,
-                ggml_tensor * diag_mask,
-                        int   il);
+    using graph = llama_model_mamba::graph;
 
-    const llama_model & model;
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-template <bool iswa>
-struct llm_build_lfm2 : public llm_graph_context {
-    llm_build_lfm2(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_jamba : public llama_model_base {
+    llama_model_jamba(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_build_mamba_base {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_llada : public llm_graph_context {
-    llm_build_llada(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_xverse : public llama_model_base {
+    llama_model_xverse(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_llada_moe : public llm_graph_context {
-    llm_build_llada_moe(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_command_r : public llama_model_base {
+    llama_model_command_r(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+
+
+struct llama_model_cohere2 : public llama_model_base {
+    llama_model_cohere2(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+
+
+struct llama_model_dbrx : public llama_model_base {
+    llama_model_dbrx(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+
+
+struct llama_model_olmo : public llama_model_base {
+    llama_model_olmo(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+
+
+struct llama_model_olmo2 : public llama_model_base {
+    llama_model_olmo2(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    template <bool iswa>
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+
+
+struct llama_model_olmoe : public llama_model_base {
+    llama_model_olmoe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+
+
+struct llama_model_openelm : public llama_model_base {
+    llama_model_openelm(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+
+
+struct llama_model_gptneox : public llama_model_base {
+    llama_model_gptneox(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+
+
+struct llama_model_arctic : public llama_model_base {
+    llama_model_arctic(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-template <bool embed>
-struct llm_build_llama : public llm_graph_context {
-    llm_build_llama(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_deepseek : public llama_model_base {
+    llama_model_deepseek(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-template <bool iswa>
-struct llm_build_llama4 : public llm_graph_context {
-    llm_build_llama4(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_deepseek2 : public llama_model_base {
+    llama_model_deepseek2(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_maincoder : public llm_graph_context {
-    llm_build_maincoder(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_deepseek2ocr : public llama_model_base {
+    llama_model_deepseek2ocr(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    using graph = llama_model_deepseek2::graph;
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_mamba : public llm_build_mamba_base {
-    llm_build_mamba(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_glm_dsa : public llama_model_base {
+    llama_model_glm_dsa(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    using graph = llama_model_deepseek2::graph;
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_mimo2_iswa : public llm_graph_context {
-    llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_mistral4 : public llama_model_deepseek2 {
+    llama_model_mistral4(const struct llama_model_params & params) : llama_model_deepseek2(params) {}
+    // reuse load_arch_hparams and load_arch_tensors from llama_model_deepseek2
+
+    using graph = llama_model_deepseek2::graph;
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_minicpm3 : public llm_graph_context {
-    llm_build_minicpm3(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_chatglm : public llama_model_base {
+    llama_model_chatglm(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_minimax_m2 : public llm_graph_context {
-    llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_glm4 : public llama_model_base {
+    llama_model_glm4(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_mistral3 : public llm_graph_context {
-    llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_glm4_moe : public llama_model_base {
+    llama_model_glm4_moe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_modern_bert : public llm_graph_context {
-    llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_bitnet : public llama_model_base {
+    llama_model_bitnet(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_mpt : public llm_graph_context {
-    llm_build_mpt(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_t5 : public llama_model_base {
+    llama_model_t5(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    template <bool is_enc>
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_nemotron : public llm_graph_context {
-    llm_build_nemotron(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_t5encoder : public llama_model_base {
+    llama_model_t5encoder(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    using graph = llama_model_t5::graph<true>;
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_nemotron_h : public llm_build_mamba_base {
-    llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params);
-    ggml_tensor * build_ffn_layer(ggml_tensor * cur, const llama_model & model, int il);
-    ggml_tensor * build_attention_layer(ggml_tensor * cur, llm_graph_input_attn_kv * inp_attn,
-        const llama_model & model, int64_t n_embd_head, int il);
+
+struct llama_model_jais : public llama_model_base {
+    llama_model_jais(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_neo_bert : public llm_graph_context {
-    llm_build_neo_bert(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_jais2 : public llama_model_base {
+    llama_model_jais2(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_eurobert : public llm_graph_context {
-    llm_build_eurobert(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_nemotron : public llama_model_base {
+    llama_model_nemotron(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-template <bool iswa>
-struct llm_build_olmo2 : public llm_graph_context {
-    llm_build_olmo2(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_nemotron_h : public llama_model_base {
+    llama_model_nemotron_h(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_build_mamba_base {
+        graph(const llama_model & model, const llm_graph_params & params);
+        ggml_tensor * build_ffn_layer(ggml_tensor * cur, const llama_model & model, int il);
+        ggml_tensor * build_attention_layer(ggml_tensor * cur, llm_graph_input_attn_kv * inp_attn,
+            const llama_model & model, int64_t n_embd_head, int il);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_olmoe : public llm_graph_context {
-    llm_build_olmoe(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_nemotron_h_moe : public llama_model_nemotron_h {
+    llama_model_nemotron_h_moe(const struct llama_model_params & params) : llama_model_nemotron_h(params) {}
+    // reuse load_arch_hparams and load_arch_tensors from llama_model_nemotron_h
+
+    using graph = llama_model_nemotron_h::graph;
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_olmo : public llm_graph_context {
-    llm_build_olmo(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_exaone : public llama_model_base {
+    llama_model_exaone(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_openai_moe_iswa : public llm_graph_context {
-    llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_exaone4 : public llama_model_base {
+    llama_model_exaone4(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    template <bool iswa>
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_openelm : public llm_graph_context {
-    llm_build_openelm(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_exaone_moe : public llama_model_base {
+    llama_model_exaone_moe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_orion : public llm_graph_context {
-    llm_build_orion(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_rwkv6 : public llama_model_base {
+    llama_model_rwkv6(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_build_rwkv6_base {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_pangu_embedded : public llm_graph_context {
-    llm_build_pangu_embedded(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_rwkv6qwen2 : public llama_model_base {
+    llama_model_rwkv6qwen2(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_build_rwkv6_base {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_phi2 : public llm_graph_context {
-    llm_build_phi2(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_rwkv7 : public llama_model_base {
+    llama_model_rwkv7(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_build_rwkv7_base {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-template <bool iswa>
-struct llm_build_phi3 : public llm_graph_context {
-    llm_build_phi3(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_arwkv7 : public llama_model_base {
+    llama_model_arwkv7(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_build_rwkv7_base {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_plamo2 : public llm_build_mamba_base {
-    llm_build_plamo2(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_granite : public llama_model_base {
+    llama_model_granite(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+
     private:
-        ggml_tensor * build_plamo2_mamba_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il);
-        ggml_tensor * build_plamo2_attn_layer(llm_graph_input_attn_kv * inp, ggml_tensor * inp_pos, ggml_tensor * cur,
-                                                const llama_model & model, int il);
+        ggml_tensor * build_attention_layer(
+                  ggml_tensor             * cur,
+                  ggml_tensor             * inp_pos,
+                  llm_graph_input_attn_kv * inp_attn,
+            const llama_model             & model,
+            const int64_t                 n_embd_head,
+            const int                     il);
+
+        ggml_tensor * build_layer_ffn(
+                  ggml_tensor       * cur,
+                  ggml_tensor       * inpSA,
+            const llama_model       & model,
+            const int                 il);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_plamo : public llm_graph_context {
-    llm_build_plamo(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_granite_moe : public llama_model_base {
+    llama_model_granite_moe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    using graph = llama_model_granite::graph;
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-template <bool iswa>
-struct llm_build_plamo3 : public llm_graph_context {
-    llm_build_plamo3(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_minicpm : public llama_model_base {
+    llama_model_minicpm(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    using graph = llama_model_granite::graph;
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_plm : public llm_graph_context {
-    llm_build_plm(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_granite_hybrid : public llama_model_base {
+    llama_model_granite_hybrid(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_build_mamba_base {
+        graph(const llama_model & model, const llm_graph_params & params);
+        ggml_tensor * build_layer_ffn(ggml_tensor * cur, ggml_tensor * inpSA, const llama_model & model, const int il);
+        ggml_tensor * build_attention_layer(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn,
+            const llama_model & model,const int64_t n_embd_head, const int il);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_qwen2 : public llm_graph_context {
-    llm_build_qwen2(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_chameleon : public llama_model_base {
+    llama_model_chameleon(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_qwen2moe : public llm_graph_context {
-    llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_wavtokenizer_dec : public llama_model_base {
+    llama_model_wavtokenizer_dec(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_qwen2vl : public llm_graph_context {
-    llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_plm : public llama_model_base {
+    llama_model_plm(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_qwen3 : public llm_graph_context {
-    llm_build_qwen3(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_bailingmoe : public llama_model_base {
+    llama_model_bailingmoe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_qwen3moe : public llm_graph_context {
-    llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_bailingmoe2 : public llama_model_base {
+    llama_model_bailingmoe2(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_qwen3vl : public llm_graph_context {
-    llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_seed_oss : public llama_model_base {
+    llama_model_seed_oss(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_qwen3vlmoe : public llm_graph_context {
-    llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_dots1 : public llama_model_base {
+    llama_model_dots1(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_qwen3next : public llm_build_delta_net_base {
-    llm_build_qwen3next(const llama_model & model, const llm_graph_params & params);
-private:
-    ggml_tensor * build_layer_attn(
-    llm_graph_input_attn_kv * inp_attn,
-                ggml_tensor * cur,
-                ggml_tensor * inp_pos,
-                        int   il);
 
-    ggml_tensor * build_layer_attn_linear(
-         llm_graph_input_rs * inp,
-                ggml_tensor * cur,
-                        int   il);
+struct llama_model_arcee : public llama_model_base {
+    llama_model_arcee(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
 
-    ggml_tensor * build_layer_ffn(
-                ggml_tensor * cur,
-                        int   il);
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
 
-    ggml_tensor * build_norm_gated(
-                ggml_tensor * input,
-                ggml_tensor * weights,
-                ggml_tensor * gate,
-                        int   layer);
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
 
-    // returns pair of qkv, z
-    std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
-                ggml_tensor * input,
-                        int   il);
 
-    const llama_model & model;
+struct llama_model_afmoe : public llama_model_base {
+    llama_model_afmoe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_qwen35 : public llm_build_delta_net_base {
-    llm_build_qwen35(const llama_model & model, const llm_graph_params & params);
-private:
-    ggml_tensor * build_layer_attn(
-    llm_graph_input_attn_kv * inp_attn,
-                ggml_tensor * cur,
-                ggml_tensor * inp_pos,
-                        int * sections,
-                        int   il);
 
-    ggml_tensor * build_layer_attn_linear(
-         llm_graph_input_rs * inp,
-                ggml_tensor * cur,
-                        int   il);
+struct llama_model_ernie4_5 : public llama_model_base {
+    llama_model_ernie4_5(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
 
-    ggml_tensor * build_layer_ffn(
-                ggml_tensor * cur,
-                        int   il);
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
 
-    ggml_tensor * build_norm_gated(
-                ggml_tensor * input,
-                ggml_tensor * weights,
-                ggml_tensor * gate,
-                        int   layer);
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
 
-    // returns pair of qkv, z
-    std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
-                ggml_tensor * input,
-                        int   il);
 
-    const llama_model & model;
+struct llama_model_ernie4_5_moe : public llama_model_ernie4_5 {
+    llama_model_ernie4_5_moe(const struct llama_model_params & params) : llama_model_ernie4_5(params) {}
+    // reuse load_arch_hparams and load_arch_tensors from llama_model_ernie4_5
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-// TODO: derive llm_build_delta_net_base instead
-struct llm_build_qwen35moe : public llm_build_delta_net_base {
-    llm_build_qwen35moe(const llama_model & model, const llm_graph_params & params);
-private:
-    ggml_tensor * build_layer_attn(
-    llm_graph_input_attn_kv * inp_attn,
-                ggml_tensor * cur,
-                ggml_tensor * inp_pos,
-                        int * sections,
-                        int   il);
 
-    ggml_tensor * build_layer_attn_linear(
-         llm_graph_input_rs * inp,
-                ggml_tensor * cur,
-                        int   il);
+struct llama_model_paddleocr : public llama_model_ernie4_5 {
+    llama_model_paddleocr(const struct llama_model_params & params) : llama_model_ernie4_5(params) {}
+    // reuse load_arch_hparams and load_arch_tensors from llama_model_ernie4_5
 
-    ggml_tensor * build_layer_ffn(
-                ggml_tensor * cur,
-                        int   il);
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
 
-    ggml_tensor * build_norm_gated(
-                ggml_tensor * input,
-                ggml_tensor * weights,
-                ggml_tensor * gate,
-                        int   layer);
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
 
-    // returns pair of qkv, z
-    std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
-                ggml_tensor * input,
-                        int   il);
 
-    const llama_model & model;
+struct llama_model_hunyuan_moe : public llama_model_base {
+    llama_model_hunyuan_moe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_qwen : public llm_graph_context {
-    llm_build_qwen(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_hunyuan_vl : public llama_model_base {
+    llama_model_hunyuan_vl(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_refact : public llm_graph_context {
-    llm_build_refact(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_hunyuan_dense : public llama_model_hunyuan_vl {
+    llama_model_hunyuan_dense(const struct llama_model_params & params) : llama_model_hunyuan_vl(params) {}
+    // reuse load_arch_hparams and load_arch_tensors from llama_model_hunyuan_vl
+
+    using graph = llama_model_hunyuan_vl::graph;
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_rnd1 : public llm_graph_context {
-    llm_build_rnd1(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_smollm3 : public llama_model_base {
+    llama_model_smollm3(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_rwkv6 : public llm_build_rwkv6_base {
-    llm_build_rwkv6(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_openai_moe : public llama_model_base {
+    llama_model_openai_moe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
-    llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_falcon_h1 : public llama_model_base {
+    llama_model_falcon_h1(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_build_mamba_base {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_rwkv7 : public llm_build_rwkv7_base {
-    llm_build_rwkv7(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_lfm2 : public llama_model_base {
+    llama_model_lfm2(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    template <bool iswa>
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_seed_oss : public llm_graph_context {
-    llm_build_seed_oss(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_lfm2moe : public llama_model_base {
+    llama_model_lfm2moe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    template <bool iswa>
+    using graph = llama_model_lfm2::graph<iswa>;
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-template <bool iswa>
-struct llm_build_smallthinker : public llm_graph_context {
-    llm_build_smallthinker(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_smallthinker : public llama_model_base {
+    llama_model_smallthinker(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    template <bool iswa>
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_smollm3 : public llm_graph_context {
-    llm_build_smollm3(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_grovemoe : public llama_model_base {
+    llama_model_grovemoe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_stablelm : public llm_graph_context {
-    llm_build_stablelm(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_apertus : public llama_model_base {
+    llama_model_apertus(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_starcoder2 : public llm_graph_context {
-    llm_build_starcoder2(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_minimax_m2 : public llama_model_base {
+    llama_model_minimax_m2(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_starcoder : public llm_graph_context {
-    llm_build_starcoder(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_cogvlm : public llama_model_base {
+    llama_model_cogvlm(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_step35_iswa : public llm_graph_context {
-    llm_build_step35_iswa(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_pangu_embed : public llama_model_base {
+    llama_model_pangu_embed(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-template <bool is_enc>
-struct llm_build_t5 : public llm_graph_context {
-    llm_build_t5(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_qwen3next : public llama_model_base {
+    llama_model_qwen3next(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_build_delta_net_base {
+        graph(const llama_model & model, const llm_graph_params & params);
+    private:
+        ggml_tensor * build_layer_attn(
+        llm_graph_input_attn_kv * inp_attn,
+                    ggml_tensor * cur,
+                    ggml_tensor * inp_pos,
+                            int   il);
+
+        ggml_tensor * build_layer_attn_linear(
+             llm_graph_input_rs * inp,
+                    ggml_tensor * cur,
+                            int   il);
+
+        ggml_tensor * build_layer_ffn(
+                    ggml_tensor * cur,
+                            int   il);
+
+        ggml_tensor * build_norm_gated(
+                    ggml_tensor * input,
+                    ggml_tensor * weights,
+                    ggml_tensor * gate,
+                            int   layer);
+
+        // returns pair of qkv, z
+        std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
+                    ggml_tensor * input,
+                            int   il);
+
+        const llama_model & model;
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_t5encoder : public llm_build_t5<true> {
-    llm_build_t5encoder(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_qwen35 : public llama_model_base {
+    llama_model_qwen35(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_build_delta_net_base {
+        graph(const llama_model & model, const llm_graph_params & params);
+    private:
+        ggml_tensor * build_layer_attn(
+        llm_graph_input_attn_kv * inp_attn,
+                    ggml_tensor * cur,
+                    ggml_tensor * inp_pos,
+                            int * sections,
+                            int   il);
+
+        ggml_tensor * build_layer_attn_linear(
+             llm_graph_input_rs * inp,
+                    ggml_tensor * cur,
+                            int   il);
+
+        ggml_tensor * build_layer_ffn(
+                    ggml_tensor * cur,
+                            int   il);
+
+        ggml_tensor * build_norm_gated(
+                    ggml_tensor * input,
+                    ggml_tensor * weights,
+                    ggml_tensor * gate,
+                            int   layer);
+
+        // returns pair of qkv, z
+        std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
+                    ggml_tensor * input,
+                            int   il);
+
+        const llama_model & model;
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_wavtokenizer_dec : public llm_graph_context {
-    llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_qwen35moe : public llama_model_base {
+    llama_model_qwen35moe(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_build_delta_net_base {
+        graph(const llama_model & model, const llm_graph_params & params);
+    private:
+        ggml_tensor * build_layer_attn(
+        llm_graph_input_attn_kv * inp_attn,
+                    ggml_tensor * cur,
+                    ggml_tensor * inp_pos,
+                            int * sections,
+                            int   il);
+
+        ggml_tensor * build_layer_attn_linear(
+             llm_graph_input_rs * inp,
+                    ggml_tensor * cur,
+                            int   il);
+
+        ggml_tensor * build_layer_ffn(
+                    ggml_tensor * cur,
+                            int   il);
+
+        ggml_tensor * build_norm_gated(
+                    ggml_tensor * input,
+                    ggml_tensor * weights,
+                    ggml_tensor * gate,
+                            int   layer);
+
+        // returns pair of qkv, z
+        std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(
+                    ggml_tensor * input,
+                            int   il);
+
+        const llama_model & model;
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+
+
+struct llama_model_mistral3 : public llama_model_base {
+    llama_model_mistral3(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+
+
+struct llama_model_mimo2 : public llama_model_base {
+    llama_model_mimo2(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
-struct llm_build_xverse : public llm_graph_context {
-    llm_build_xverse(const llama_model & model, const llm_graph_params & params);
+
+struct llama_model_kimi_linear : public llama_model_base {
+    llama_model_kimi_linear(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_build_delta_net_base {
+        graph(const llama_model & model, const llm_graph_params & params);
+
+        std::pair<ggml_tensor *, ggml_tensor *> build_kda_autoregressive(
+                    ggml_tensor * q,
+                    ggml_tensor * k,
+                    ggml_tensor * v,
+                    ggml_tensor * gk,
+                    ggml_tensor * beta,
+                    ggml_tensor * state,
+                            int   il);
+
+        std::pair<ggml_tensor *, ggml_tensor *> build_kda_chunking(
+                    ggml_tensor * q,
+                    ggml_tensor * k,
+                    ggml_tensor * v,
+                    ggml_tensor * gk,
+                    ggml_tensor * beta,
+                    ggml_tensor * state,
+                    ggml_tensor * causal_mask,
+                    ggml_tensor * identity,
+                    ggml_tensor * diag_mask,
+                            int   il);
+
+        const llama_model & model;
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+
+
+struct llama_model_step35 : public llama_model_base {
+    llama_model_step35(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
diff --git a/src/models/modern-bert.cpp b/src/models/modern-bert.cpp
index 5c6a1b5e1bcc..e9b79ffc6dc0 100644
--- a/src/models/modern-bert.cpp
+++ b/src/models/modern-bert.cpp
@@ -1,6 +1,69 @@
 #include "models.h"
 
-llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_modern_bert::load_arch_hparams(llama_model_loader & ml) {
+    const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+    if (found_swa && hparams.n_swa > 0) {
+        hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+        uint32_t swa_period = 3;
+        ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+        hparams.set_swa_pattern(swa_period, true);
+    } else {
+        hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+    }
+
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+    switch (hparams.n_layer) {
+        case 12:
+            type = LLM_TYPE_47M; break; // granite-embedding-small
+        case 22:
+            type = LLM_TYPE_149M; break; // modern-bert-base
+        case 28:
+            type = LLM_TYPE_395M; break; // modern-bert-large
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_modern_bert::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0);
+
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+    for(int i = 0; i < n_layer; ++i) {
+        auto& layer = layers[i];
+
+        if ( i != 0 ) {
+            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        } else{
+            // layer 0 uses identity
+            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+        }
+
+
+        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd }, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT,   "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, 2 * n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+    }
+
+    cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT,  "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+    cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT,  "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
+    cls       = create_tensor(tn(LLM_TENSOR_CLS,      "weight"), {n_embd, n_embd},            TENSOR_NOT_REQUIRED);
+    cls_norm  = create_tensor(tn(LLM_TENSOR_CLS_NORM, "weight"), {n_embd},                    TENSOR_NOT_REQUIRED);
+
+}
+
+std::unique_ptr<llm_graph_context> llama_model_modern_bert::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_modern_bert::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/mpt.cpp b/src/models/mpt.cpp
index 8596bbb2024c..cfc60e8de293 100644
--- a/src/models/mpt.cpp
+++ b/src/models/mpt.cpp
@@ -1,6 +1,70 @@
 #include "models.h"
 
-llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_mpt::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
+    ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false);
+    ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
+
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_7B; break;
+        case 48: type = LLM_TYPE_30B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_mpt::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
+
+    // output
+    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, TENSOR_NOT_REQUIRED);
+
+    output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    if (!output) {
+        output    = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+        layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, TENSOR_NOT_REQUIRED);
+
+        // FIXME test-llama-archs crashes if q_norm is created
+        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
+        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
+
+        layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+        layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+        // AWQ ScaleActivation layer
+        layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_mpt::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_mpt::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/nemotron-h-moe.cpp b/src/models/nemotron-h-moe.cpp
new file mode 100644
index 000000000000..a59cc6c9fbd7
--- /dev/null
+++ b/src/models/nemotron-h-moe.cpp
@@ -0,0 +1,6 @@
+#include "models.h"
+
+std::unique_ptr<llm_graph_context> llama_model_nemotron_h_moe::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
diff --git a/src/models/nemotron-h.cpp b/src/models/nemotron-h.cpp
index dc07d43df589..865461f61db8 100644
--- a/src/models/nemotron-h.cpp
+++ b/src/models/nemotron-h.cpp
@@ -1,6 +1,127 @@
 #include "models.h"
 
-llm_build_nemotron_h::llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params) :
+void llama_model_nemotron_h::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
+    ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
+    ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
+    ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+    ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
+
+    // A layer is recurrent IFF the n_head_kv value is set to 0 and
+    // the n_ff value is set to 0
+    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+        hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
+    }
+
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp,        false);
+    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp,      false);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_MOE_LATENT_SIZE,                   hparams.moe_latent_size, false);
+
+    switch (hparams.n_layer) {
+        case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
+        case 56: type = LLM_TYPE_9B; break;
+        case 88: type = LLM_TYPE_120B_A12B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_nemotron_h::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    // mamba2 Mixer SSM params
+    // NOTE: int64_t for tensor dimensions
+    const int64_t d_conv     = hparams.ssm_d_conv;
+    const int64_t d_inner    = hparams.ssm_d_inner;
+    const int64_t d_state    = hparams.ssm_d_state;
+    const int64_t n_ssm_head = hparams.ssm_dt_rank;
+    const int64_t n_group    = hparams.ssm_n_group;
+    const int64_t d_in_proj  = 2*d_inner + 2*n_group*d_state + n_ssm_head;
+    const int64_t moe_n_embd = hparams.moe_latent_size > 0 ? hparams.moe_latent_size : n_embd;
+
+    // embeddings
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    {
+        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+        // if output is NULL, init from the input tok embed, duplicated to allow offloading
+        if (output == NULL) {
+            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+        }
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        // all blocks use the attn norm
+        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        if (hparams.is_recurrent(i)) {
+            // ssm layers
+            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
+
+            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
+            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
+
+            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
+
+            // no "weight" suffix for these
+            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
+            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
+
+            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
+
+            // out_proj
+            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+        } else if (hparams.n_ff(i) == 0) {
+            // attention layers (with optional bias)
+            const int64_t n_head_i = hparams.n_head(i);
+            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
+            const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
+            create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_i, n_embd_k_gqa_i, n_embd_v_gqa_i, 0);
+            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
+            layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+        }  else {
+            if (n_expert != 0) {
+                const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+                const int64_t n_ff_shexp = hparams.n_ff_shexp;
+
+                layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert}, 0);
+                layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert         }, 0);
+
+                // MoE branch
+                layer.ffn_latent_down = create_tensor(tn(LLM_TENSOR_FFN_LATENT_DOWN, "weight", i), {n_embd, moe_n_embd}, TENSOR_NOT_REQUIRED);
+                layer.ffn_latent_up   = create_tensor(tn(LLM_TENSOR_FFN_LATENT_UP,   "weight", i), {moe_n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+
+                layer.ffn_down_exps   = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   moe_n_embd, n_expert}, 0);
+                layer.ffn_up_exps     = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {moe_n_embd, n_ff_exp, n_expert}, 0);
+
+                // Shared expert branch
+                layer.ffn_down_shexp  = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+                layer.ffn_up_shexp    = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, 0);
+
+            } else {
+                // mlp layers
+                layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  hparams.n_ff(i), n_embd}, 0);
+                layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   hparams.n_ff(i)}, 0);
+                layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
+                layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias",   i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+            }
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_nemotron_h::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_nemotron_h::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_build_mamba_base(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
@@ -60,7 +181,7 @@ llm_build_nemotron_h::llm_build_nemotron_h(const llama_model & model, const llm_
     ggml_build_forward_expand(gf, cur);
 }
 
-ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *             cur,
+ggml_tensor * llama_model_nemotron_h::graph::build_attention_layer(ggml_tensor *             cur,
                                                           llm_graph_input_attn_kv * inp_attn,
                                                           const llama_model &       model,
                                                                 int64_t             n_embd_head,
@@ -76,7 +197,7 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
     return cur;
 }
 
-ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, int il) {
+ggml_tensor * llama_model_nemotron_h::graph::build_ffn_layer(ggml_tensor * cur, const llama_model & model, int il) {
     if (model.layers[il].ffn_gate_inp == nullptr) {
         cur = build_ffn(cur,
                 model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   model.layers[il].ffn_up_s,
diff --git a/src/models/nemotron.cpp b/src/models/nemotron.cpp
index 054b16fe0efd..0c72ed297aa6 100644
--- a/src/models/nemotron.cpp
+++ b/src/models/nemotron.cpp
@@ -1,6 +1,52 @@
 #include "models.h"
 
-llm_build_nemotron::llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_nemotron::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_4B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_nemotron::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+    output        = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        // optional bias tensors
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+        // optional MLP bias
+        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_nemotron::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_nemotron::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/neo-bert.cpp b/src/models/neo-bert.cpp
index da68024a34d1..f00d6eddfc99 100644
--- a/src/models/neo-bert.cpp
+++ b/src/models/neo-bert.cpp
@@ -1,6 +1,46 @@
 #include "models.h"
 
-llm_build_neo_bert::llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_neo_bert::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    if (hparams.n_layer == 28) {
+        type = LLM_TYPE_250M;
+    }
+}
+
+void llama_model_neo_bert::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
+
+    cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+    cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);
+
+    cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+    cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
+
+    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff*2}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_neo_bert::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_neo_bert::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/nomic-bert-moe.cpp b/src/models/nomic-bert-moe.cpp
new file mode 100644
index 000000000000..a17abe2c2692
--- /dev/null
+++ b/src/models/nomic-bert-moe.cpp
@@ -0,0 +1,72 @@
+#include "models.h"
+
+void llama_model_nomic_bert_moe::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
+    ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS,         hparams.moe_every_n_layers, 0);
+
+    if (hparams.n_layer == 12 && hparams.n_embd == 768) {
+        if (arch == LLM_ARCH_NOMIC_BERT) {
+            type = LLM_TYPE_137M;
+        } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
+            type = LLM_TYPE_475M;
+        }
+    }
+}
+
+void llama_model_nomic_bert_moe::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    if (n_token_types == 0) {
+        throw std::runtime_error(arch_name() + " model needs to define token type count");
+    }
+    tok_embd     = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
+    type_embd    = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
+
+    if (arch == LLM_ARCH_BERT) {
+        pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,    "weight"), {n_embd, n_ctx_train}, 0);
+
+        cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+        cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);
+
+        cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+        cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
+    }
+
+    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0);
+    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {n_embd}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd}, 0);
+
+        if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff,   n_expert}, 0);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff,   n_embd, n_expert}, 0);
+            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,   "weight", i), {n_embd, n_expert}, 0);
+        } else {
+            layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
+
+            if (arch == LLM_ARCH_NOMIC_BERT) {
+                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+            }
+        }
+
+        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i),   {n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_nomic_bert_moe::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
diff --git a/src/models/nomic-bert.cpp b/src/models/nomic-bert.cpp
new file mode 100644
index 000000000000..5a8a5584457e
--- /dev/null
+++ b/src/models/nomic-bert.cpp
@@ -0,0 +1,72 @@
+#include "models.h"
+
+void llama_model_nomic_bert::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
+    ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS,         hparams.moe_every_n_layers, 0);
+
+    if (hparams.n_layer == 12 && hparams.n_embd == 768) {
+        if (arch == LLM_ARCH_NOMIC_BERT) {
+            type = LLM_TYPE_137M;
+        } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
+            type = LLM_TYPE_475M;
+        }
+    }
+}
+
+void llama_model_nomic_bert::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    if (n_token_types == 0) {
+        throw std::runtime_error(arch_name() + " model needs to define token type count");
+    }
+    tok_embd     = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
+    type_embd    = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
+
+    if (arch == LLM_ARCH_BERT) {
+        pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,    "weight"), {n_embd, n_ctx_train}, 0);
+
+        cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+        cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);
+
+        cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+        cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
+    }
+
+    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0);
+    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {n_embd}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd}, 0);
+
+        if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff,   n_expert}, 0);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff,   n_embd, n_expert}, 0);
+            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,   "weight", i), {n_embd, n_expert}, 0);
+        } else {
+            layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
+
+            if (arch == LLM_ARCH_NOMIC_BERT) {
+                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+            }
+        }
+
+        layer.layer_out_norm   = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+        layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i),   {n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_nomic_bert::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
diff --git a/src/models/olmo.cpp b/src/models/olmo.cpp
index a9974025f07d..161035e72bc9 100644
--- a/src/models/olmo.cpp
+++ b/src/models/olmo.cpp
@@ -1,6 +1,46 @@
 #include "models.h"
 
-llm_build_olmo::llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_olmo::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+    ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,     hparams.f_clamp_kqv, false);
+
+    switch (hparams.n_layer) {
+        case 22: type = LLM_TYPE_1B; break;
+        case 32: type = LLM_TYPE_7B; break;
+        case 80: type = LLM_TYPE_70B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_olmo::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_olmo::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_olmo::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/olmo2.cpp b/src/models/olmo2.cpp
index 308d2a600c2e..9633f2699657 100644
--- a/src/models/olmo2.cpp
+++ b/src/models/olmo2.cpp
@@ -1,7 +1,68 @@
 #include "models.h"
 
+void llama_model_olmo2::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+    if (found_swa && hparams.n_swa > 0) {
+        hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+        uint32_t swa_period = 4;
+        ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+        hparams.set_swa_pattern(swa_period);
+
+        hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+    } else {
+        hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+    }
+
+    switch (hparams.n_layer) {
+        case 16: type = LLM_TYPE_1B; break;
+        case 32: type = LLM_TYPE_7B; break;
+        case 40: type = LLM_TYPE_13B; break;
+        case 64: type = LLM_TYPE_32B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_olmo2::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const int64_t n_embd_head = n_embd / n_head;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_olmo2::build_arch_graph(const llm_graph_params & params) const {
+    if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+        return std::make_unique<graph<true>>(*this, params);
+    } else {
+        return std::make_unique<graph<false>>(*this, params);
+    }
+}
+
 template <bool iswa>
-llm_build_olmo2<iswa>::llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+llama_model_olmo2::graph<iswa>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
@@ -146,5 +207,5 @@ llm_build_olmo2<iswa>::llm_build_olmo2(const llama_model & model, const llm_grap
 }
 
 // Explicit template instantiations
-template struct llm_build_olmo2<false>;
-template struct llm_build_olmo2<true>;
+template struct llama_model_olmo2::graph<false>;
+template struct llama_model_olmo2::graph<true>;
diff --git a/src/models/olmoe.cpp b/src/models/olmoe.cpp
index ed46a00ef909..4bb9013054c0 100644
--- a/src/models/olmoe.cpp
+++ b/src/models/olmoe.cpp
@@ -1,6 +1,55 @@
 #include "models.h"
 
-llm_build_olmoe::llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_olmoe::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    switch (hparams.n_layer) {
+        case 16: type = LLM_TYPE_A1_7B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_olmoe::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+        if (n_expert == 0) {
+            throw std::runtime_error("n_expert must be > 0");
+        }
+        if (n_expert_used == 0) {
+            throw std::runtime_error("n_expert_used must be > 0");
+        }
+
+        // MoE branch
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_olmoe::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_olmoe::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/openai-moe-iswa.cpp b/src/models/openai-moe.cpp
similarity index 51%
rename from src/models/openai-moe-iswa.cpp
rename to src/models/openai-moe.cpp
index 50992b8d5060..13a590ce6460 100644
--- a/src/models/openai-moe-iswa.cpp
+++ b/src/models/openai-moe.cpp
@@ -1,6 +1,67 @@
 #include "models.h"
 
-llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_openai_moe::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
+
+    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+    uint32_t swa_period = 2;
+    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+    hparams.set_swa_pattern(swa_period);
+
+    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
+    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+
+    switch (hparams.n_layer) {
+        case 24: type = LLM_TYPE_20B; break;
+        case 36: type = LLM_TYPE_120B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_openai_moe::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const int64_t n_ff_exp = hparams.n_ff_exp;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), {n_embd}, 0);
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_head * n_rot, n_head_kv * n_rot, n_head_kv * n_rot, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
+
+        layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
+
+        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {  n_embd, n_expert}, 0);
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+        layer.ffn_gate_inp_b  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "bias", i), {n_expert}, 0);
+        layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
+        layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), {  n_embd, n_expert}, 0);
+        layer.ffn_up_exps_b   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "bias", i), {n_ff_exp, n_expert}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_openai_moe::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_openai_moe::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
diff --git a/src/models/openelm.cpp b/src/models/openelm.cpp
index 514ac33517f5..b4128e116e77 100644
--- a/src/models/openelm.cpp
+++ b/src/models/openelm.cpp
@@ -1,6 +1,53 @@
 #include "models.h"
 
-llm_build_openelm::llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_openelm::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    switch (hparams.n_layer) {
+    case 16: type = LLM_TYPE_270M; break;
+    case 20: type = LLM_TYPE_450M; break;
+    case 28: type = LLM_TYPE_1B; break;
+    case 36: type = LLM_TYPE_3B; break;
+    default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_openelm::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    // init output from the input tok embed
+    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+
+    for (int i = 0; i < n_layer; ++i) {
+        const int64_t n_head      =   hparams.n_head(i);
+        const int64_t n_head_qkv  = 2*hparams.n_head_kv(i) + n_head;
+        const int64_t n_ff        =   hparams.n_ff(i);
+
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_openelm::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_openelm::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/orion.cpp b/src/models/orion.cpp
index a5874b6dee77..7ace0a5139d4 100644
--- a/src/models/orion.cpp
+++ b/src/models/orion.cpp
@@ -1,6 +1,46 @@
 #include "models.h"
 
-llm_build_orion::llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_orion::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+    switch (hparams.n_layer) {
+        case 40: type = LLM_TYPE_14B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_orion::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_orion::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_orion::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/paddleocr.cpp b/src/models/paddleocr.cpp
index 56cb1d94c5f1..1c0eadefa984 100644
--- a/src/models/paddleocr.cpp
+++ b/src/models/paddleocr.cpp
@@ -1,6 +1,10 @@
 #include "models.h"
 
-llm_build_paddleocr::llm_build_paddleocr(const llama_model & model, const llm_graph_params & params) :
+std::unique_ptr<llm_graph_context> llama_model_paddleocr::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_paddleocr::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
 
     // NOTE: same with qwen2vl.cpp, but bias tensors are optional
diff --git a/src/models/pangu-embedded.cpp b/src/models/pangu-embed.cpp
similarity index 53%
rename from src/models/pangu-embedded.cpp
rename to src/models/pangu-embed.cpp
index 53464f21d22a..41b7e2ac23ec 100644
--- a/src/models/pangu-embedded.cpp
+++ b/src/models/pangu-embed.cpp
@@ -1,6 +1,60 @@
 #include "models.h"
 
-llm_build_pangu_embedded::llm_build_pangu_embedded(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_pangu_embed::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    switch (hparams.n_layer) {
+        case 26: type = LLM_TYPE_1B; break; // openPangu-Embedded-1B-V1.1
+        case 34: type = LLM_TYPE_7B; break; // openPangu-Embedded-7B-V1.1
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_pangu_embed::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        // weight tensors
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        // bias tensors
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        } else {
+            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        }
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_pangu_embed::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_pangu_embed::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/phi2.cpp b/src/models/phi2.cpp
index 0fb3ffa2e63d..a333602c72d5 100644
--- a/src/models/phi2.cpp
+++ b/src/models/phi2.cpp
@@ -1,6 +1,50 @@
 #include "models.h"
 
-llm_build_phi2::llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_phi2::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+    switch (hparams.n_layer) {
+        case 24: type = LLM_TYPE_1B; break;
+        case 32: type = LLM_TYPE_3B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_phi2::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+    output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+
+        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, 0);
+
+        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
+
+        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_phi2::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_phi2::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/phi3.cpp b/src/models/phi3.cpp
index 39af285d3c52..0a65e91fefa4 100644
--- a/src/models/phi3.cpp
+++ b/src/models/phi3.cpp
@@ -1,7 +1,71 @@
 #include "models.h"
 
+void llama_model_phi3::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    switch (hparams.n_layer) {
+        case 24: type = LLM_TYPE_1B; break;
+        case 32: type = LLM_TYPE_3B; break;
+        case 40: type = LLM_TYPE_14B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+
+    const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+
+    if (found_swa && hparams.n_swa > 0) {
+        LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
+                __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
+
+        // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
+        hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+
+        hparams.n_swa         = 0;
+        hparams.set_swa_pattern(1);
+    }
+}
+
+void llama_model_phi3::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, TENSOR_NOT_REQUIRED);
+        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
+
+        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_phi3::build_arch_graph(const llm_graph_params & params) const {
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        return std::make_unique<graph<true>> (*this, params);
+    } else {
+        return std::make_unique<graph<false>>(*this, params);
+    }
+}
+
 template<bool iswa>
-llm_build_phi3<iswa>::llm_build_phi3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+llama_model_phi3::graph<iswa>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
@@ -128,5 +192,5 @@ llm_build_phi3<iswa>::llm_build_phi3(const llama_model & model, const llm_graph_
 }
 
 // Explicit template instantiations
-template struct llm_build_phi3<false>;
-template struct llm_build_phi3<true>;
+template struct llama_model_phi3::graph<false>;
+template struct llama_model_phi3::graph<true>;
diff --git a/src/models/phimoe.cpp b/src/models/phimoe.cpp
new file mode 100644
index 000000000000..4575d6139cf5
--- /dev/null
+++ b/src/models/phimoe.cpp
@@ -0,0 +1,55 @@
+#include "models.h"
+
+void llama_model_phimoe::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_16x3_8B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_phimoe::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const int64_t n_embd_head = n_embd / n_head;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+    // output
+    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, 0);
+    output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   { n_vocab }, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), { n_embd }, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), { n_embd }, 0);
+
+        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), { n_embd }, 0);
+
+        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert},         0);
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff,   n_embd, n_expert}, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
+
+        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+     }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_phimoe::build_arch_graph(const llm_graph_params & params) const {
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        return std::make_unique<graph<true>> (*this, params);
+    } else {
+        return std::make_unique<graph<false>>(*this, params);
+    }
+}
+
diff --git a/src/models/plamo.cpp b/src/models/plamo.cpp
index 4d5c84506c2c..4c16c20a0d43 100644
--- a/src/models/plamo.cpp
+++ b/src/models/plamo.cpp
@@ -1,6 +1,42 @@
 #include "models.h"
 
-llm_build_plamo::llm_build_plamo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_plamo::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    switch (hparams.n_layer) {
+        case 40: type = LLM_TYPE_13B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+   }
+}
+
+void llama_model_plamo::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_plamo::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_plamo::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/plamo2.cpp b/src/models/plamo2.cpp
index b6142daebd9e..29c8702606a2 100644
--- a/src/models/plamo2.cpp
+++ b/src/models/plamo2.cpp
@@ -1,8 +1,109 @@
 #include "models.h"
-
 #include "llama-memory-recurrent.h"
 
-llm_build_plamo2::llm_build_plamo2(const llama_model & model, const llm_graph_params & params) :
+void llama_model_plamo2::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    // Load Mamba SSM parameters
+    ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
+    ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
+    ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
+    ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+    ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
+
+    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+        hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
+    }
+
+    switch (hparams.n_layer) {
+        case 16: type = LLM_TYPE_1B; break;
+        case 32:
+            if (hparams.n_embd == 2048) {
+                type = LLM_TYPE_2B;
+            } else if (hparams.n_embd == 4096) {
+                type = LLM_TYPE_8B;
+            }
+            break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_plamo2::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    // mamba parameters
+    const uint32_t d_conv             = hparams.ssm_d_conv;
+    const uint32_t d_state            = hparams.ssm_d_state;
+    const uint32_t num_heads          = hparams.ssm_dt_rank;
+    const uint32_t intermediate_size  = hparams.ssm_d_inner;
+    const int64_t dt_dim              = std::max(64, int(hparams.n_embd / 16));
+
+    // attention parameters
+    const uint32_t qk_dim = hparams.n_embd_head_k();
+    const uint32_t v_dim  = hparams.n_embd_head_v();
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+        bool is_mamba_layer = hparams.is_recurrent(i);
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        if (is_mamba_layer) {
+            layer.ssm_in       = create_tensor(tn(LLM_TENSOR_SSM_IN,     "weight", i), {n_embd, 2 * intermediate_size}, 0);
+            layer.ssm_conv1d   = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
+
+            layer.ssm_x    = create_tensor(tn(LLM_TENSOR_SSM_X,  "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
+            layer.ssm_dt   = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
+            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
+
+            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
+            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
+
+            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
+
+            layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
+            layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
+            layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
+        } else {
+            const int64_t num_attention_heads = hparams.n_head(i);
+            const int64_t q_num_heads         = num_attention_heads;
+            const int64_t num_key_value_heads = hparams.n_head_kv(i);
+            const int64_t k_num_heads         = num_key_value_heads;
+            const int64_t v_num_heads         = num_key_value_heads;
+            const int64_t q_proj_dim          = q_num_heads * qk_dim;
+            const int64_t k_proj_dim          = k_num_heads * qk_dim;
+            const int64_t v_proj_dim          = v_num_heads * v_dim;
+
+            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
+            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {qk_dim, num_attention_heads}, 0);
+            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {qk_dim, k_num_heads}, 0);
+            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
+        }
+
+        // All layers have post-attention norm, FFN norm, and FFN tensors
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, 0);
+        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_plamo2::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_plamo2::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_build_mamba_base(params) {
     ggml_tensor * cur;
     ggml_tensor * inpL;
@@ -95,7 +196,7 @@ llm_build_plamo2::llm_build_plamo2(const llama_model & model, const llm_graph_pa
     ggml_build_forward_expand(gf, cur);
 }
 
-ggml_tensor * llm_build_plamo2::build_plamo2_attn_layer(llm_graph_input_attn_kv * inp,
+ggml_tensor * llama_model_plamo2::graph::build_plamo2_attn_layer(llm_graph_input_attn_kv * inp,
                                                         ggml_tensor *             inp_pos,
                                                         ggml_tensor *             cur,
                                                         const llama_model &       model,
@@ -150,7 +251,7 @@ ggml_tensor * llm_build_plamo2::build_plamo2_attn_layer(llm_graph_input_attn_kv
     return cur;
 }
 
-ggml_tensor * llm_build_plamo2::build_plamo2_mamba_layer(llm_graph_input_rs * inp,
+ggml_tensor * llama_model_plamo2::graph::build_plamo2_mamba_layer(llm_graph_input_rs * inp,
                                                          ggml_tensor *        cur,
                                                          const llama_model &  model,
                                                          const llama_ubatch & ubatch,
diff --git a/src/models/plamo3.cpp b/src/models/plamo3.cpp
index 67844c09f244..849f1579e639 100644
--- a/src/models/plamo3.cpp
+++ b/src/models/plamo3.cpp
@@ -1,7 +1,74 @@
 #include "models.h"
 
+void llama_model_plamo3::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+    if (found_swa && hparams.n_swa > 0) {
+        hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+        uint32_t swa_period = 8;
+        ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+        hparams.set_swa_pattern(swa_period);
+    } else {
+        hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+    }
+
+    switch (hparams.n_layer) {
+        case 24: type = LLM_TYPE_2B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_plamo3::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const int64_t head_dim_q = hparams.n_embd_head_k();
+    const int64_t head_dim_v = hparams.n_embd_head_v();
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        const int64_t num_attention_heads = hparams.n_head(i);
+        const int64_t num_key_value_heads = hparams.n_head_kv(i);
+        const int64_t q_proj_dim = num_attention_heads * head_dim_q;
+        const int64_t k_proj_dim = num_key_value_heads * head_dim_q;
+        const int64_t v_proj_dim = num_key_value_heads * head_dim_v;
+        const int64_t n_ff_cur   = hparams.n_ff(i);
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i),
+                {n_embd,q_proj_dim + k_proj_dim + v_proj_dim}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim_q}, 0);
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim_q}, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {num_attention_heads * head_dim_v, n_embd}, 0);
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
+
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff_cur * 2}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_plamo3::build_arch_graph(const llm_graph_params & params) const {
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        return std::make_unique<graph<true>> (*this, params);
+    } else {
+        return std::make_unique<graph<false>>(*this, params);
+    }
+}
+
 template <bool iswa>
-llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) :
+llama_model_plamo3::graph<iswa>::graph(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     const int64_t head_dim_q = hparams.n_embd_head_k();
     const int64_t head_dim_v = hparams.n_embd_head_v();
@@ -126,5 +193,5 @@ llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_gr
 }
 
 // Explicit template instantiations
-template struct llm_build_plamo3<false>;
-template struct llm_build_plamo3<true>;
+template struct llama_model_plamo3::graph<false>;
+template struct llama_model_plamo3::graph<true>;
diff --git a/src/models/plm.cpp b/src/models/plm.cpp
index abce6b34d043..57f5995103ba 100644
--- a/src/models/plm.cpp
+++ b/src/models/plm.cpp
@@ -1,6 +1,50 @@
 #include "models.h"
 
-llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_plm::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_1_8B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_plm::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const int64_t n_embd_head_qk_rope = hparams.n_rot();
+    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
+    const int64_t kv_lora_rank = hparams.n_lora_kv;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    // output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.wq        = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
+        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
+        layer.wkv_b     = create_tensor(tn(LLM_TENSOR_ATTN_KV_B,     "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
+        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,      "weight", i), {              n_head * (                      n_embd_head_v), n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_plm::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_plm::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k()));
 
     const uint32_t n_embd_head_qk_rope = hparams.n_rot();
diff --git a/src/models/qwen.cpp b/src/models/qwen.cpp
index 44e75d874379..cdc076cdf776 100644
--- a/src/models/qwen.cpp
+++ b/src/models/qwen.cpp
@@ -1,6 +1,46 @@
 #include "models.h"
 
-llm_build_qwen::llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_qwen::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_7B; break;
+        case 40: type = LLM_TYPE_13B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_qwen::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
+        layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
+        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff/2}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_qwen::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_qwen::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/qwen2.cpp b/src/models/qwen2.cpp
index 2892dd750876..6320458a13b4 100644
--- a/src/models/qwen2.cpp
+++ b/src/models/qwen2.cpp
@@ -1,6 +1,55 @@
 #include "models.h"
 
-llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_qwen2::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    switch (hparams.n_layer) {
+        case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
+        case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
+        case 32: type = LLM_TYPE_7B; break;
+        case 36: type = LLM_TYPE_3B; break;
+        case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
+        case 48: type = LLM_TYPE_14B; break;
+        case 64: type = LLM_TYPE_32B; break;
+        case 80: type = LLM_TYPE_70B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_qwen2::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    output_b    = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_qwen2::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_qwen2::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/qwen2moe.cpp b/src/models/qwen2moe.cpp
index 5f0a6861b68d..7587c802c680 100644
--- a/src/models/qwen2moe.cpp
+++ b/src/models/qwen2moe.cpp
@@ -1,6 +1,67 @@
 #include "models.h"
 
-llm_build_qwen2moe::llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_qwen2moe::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
+    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    switch (hparams.n_layer) {
+        case 24: type = LLM_TYPE_A2_7B; break;
+        case 28: type = LLM_TYPE_57B_A14B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_qwen2moe::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+        if (n_expert == 0) {
+            throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
+        }
+        if (n_expert_used == 0) {
+            throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
+        }
+
+        // MoE branch
+        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+
+        // Shared expert branch
+        const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
+
+        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
+        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, n_ff_shexp}, 0);
+        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp,     n_embd}, 0);
+        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, n_ff_shexp}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_qwen2moe::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_qwen2moe::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/qwen2vl.cpp b/src/models/qwen2vl.cpp
index da7937c7667f..1a40fa89be4d 100644
--- a/src/models/qwen2vl.cpp
+++ b/src/models/qwen2vl.cpp
@@ -1,6 +1,45 @@
 #include "models.h"
 
-llm_build_qwen2vl::llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_qwen2vl::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
+}
+// fall through
+
+void llama_model_qwen2vl::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    output_b    = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_qwen2vl::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_qwen2vl::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp
index 883dd5f9a905..fa656c84ea08 100644
--- a/src/models/qwen3.cpp
+++ b/src/models/qwen3.cpp
@@ -1,6 +1,55 @@
 #include "models.h"
 
-llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_qwen3::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    switch (hparams.n_layer) {
+        case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
+        case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
+        case 40: type = LLM_TYPE_14B; break;
+        case 64: type = LLM_TYPE_32B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_qwen3::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    // output rerank head
+    cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_qwen3::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_qwen3::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp
index 87790f08e4ee..f276be61ba87 100644
--- a/src/models/qwen35.cpp
+++ b/src/models/qwen35.cpp
@@ -1,8 +1,96 @@
 #include "models.h"
-
 #include "llama-memory-recurrent.h"
 
-llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_params & params) :
+void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
+    ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS,    hparams.rope_sections, 4, true);
+
+    // Load linear attention (gated delta net) parameters
+    ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
+    ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
+    ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
+    ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+    ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
+
+    // Mark recurrent layers (linear attention layers)
+    {
+        uint32_t full_attn_interval = 4;
+        ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
+        for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+            hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
+        }
+    }
+
+    switch (hparams.n_layer) {
+        case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_8B : LLM_TYPE_2B; break;
+        case 32: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_9B; break;
+        case 64: type = LLM_TYPE_27B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_qwen35::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+    }
+
+    // Calculate dimensions from hyperparameters
+    const int64_t head_k_dim = hparams.ssm_d_state;
+    const int64_t head_v_dim = hparams.ssm_d_state;
+    const int64_t n_k_heads  = hparams.ssm_n_group;
+    const int64_t n_v_heads  = hparams.ssm_dt_rank;
+    const int64_t key_dim    = head_k_dim * n_k_heads;
+    const int64_t value_dim  = head_v_dim * n_v_heads;
+    const int64_t conv_dim   = key_dim * 2 + value_dim;
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), { n_embd }, 0);
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+
+        if (!hparams.is_recurrent(i)) {
+            // Attention layers
+            create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0);
+            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+            // Q/K normalization for attention layers
+            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+        } else {
+            // Linear attention (gated delta net) specific tensors
+            // Create tensors with calculated dimensions
+            layer.wqkv           = create_tensor(tn(LLM_TENSOR_ATTN_QKV,       "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
+            layer.wqkv_gate      = create_tensor(tn(LLM_TENSOR_ATTN_GATE,      "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
+            layer.ssm_conv1d     = create_tensor(tn(LLM_TENSOR_SSM_CONV1D,     "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
+            layer.ssm_dt         = create_tensor(tn(LLM_TENSOR_SSM_DT,         "bias",   i), { hparams.ssm_dt_rank }, 0);
+            layer.ssm_a          = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN,             i), { hparams.ssm_dt_rank }, 0);
+            layer.ssm_beta       = create_tensor(tn(LLM_TENSOR_SSM_BETA,       "weight", i), { n_embd, n_v_heads }, 0);
+            layer.ssm_alpha      = create_tensor(tn(LLM_TENSOR_SSM_ALPHA,      "weight", i), { n_embd, n_v_heads }, 0);
+            layer.ssm_norm       = create_tensor(tn(LLM_TENSOR_SSM_NORM,       "weight", i), { head_v_dim }, 0);
+            layer.ssm_out        = create_tensor(tn(LLM_TENSOR_SSM_OUT,        "weight", i), { value_dim, n_embd }, 0);
+        }
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_qwen35::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_build_delta_net_base(params), model(model) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
@@ -87,7 +175,7 @@ llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_pa
     ggml_build_forward_expand(gf, cur);
 }
 
-std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35::build_qkvz(
+std::pair<ggml_tensor *, ggml_tensor *> llama_model_qwen35::graph::build_qkvz(
                 ggml_tensor * input,
                         int   il) {
     const int64_t n_seqs       = ubatch.n_seqs;
@@ -103,7 +191,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35::build_qkvz(
     return { qkv_mixed, z };
 }
 
-ggml_tensor * llm_build_qwen35::build_norm_gated(
+ggml_tensor * llama_model_qwen35::graph::build_norm_gated(
         ggml_tensor * input,
         ggml_tensor * weights,
         ggml_tensor * gate,
@@ -114,7 +202,7 @@ ggml_tensor * llm_build_qwen35::build_norm_gated(
     return ggml_mul(ctx0, normalized, gated_silu);
 }
 
-ggml_tensor * llm_build_qwen35::build_layer_attn(
+ggml_tensor * llama_model_qwen35::graph::build_layer_attn(
         llm_graph_input_attn_kv * inp,
         ggml_tensor *             cur,
         ggml_tensor *             inp_pos,
@@ -195,7 +283,7 @@ ggml_tensor * llm_build_qwen35::build_layer_attn(
     return cur;
 }
 
-ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
+ggml_tensor * llama_model_qwen35::graph::build_layer_attn_linear(
         llm_graph_input_rs * inp,
         ggml_tensor *        cur,
         int                  il) {
@@ -369,7 +457,7 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
     return cur;
 }
 
-ggml_tensor * llm_build_qwen35::build_layer_ffn(ggml_tensor * cur, const int il) {
+ggml_tensor * llama_model_qwen35::graph::build_layer_ffn(ggml_tensor * cur, const int il) {
     // Qwen3.5 does not use MoE FFN
     GGML_ASSERT(model.layers[il].ffn_gate_inp == nullptr);
 
diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp
index 7dc6a23c7518..cf05dc9d61cf 100644
--- a/src/models/qwen35moe.cpp
+++ b/src/models/qwen35moe.cpp
@@ -1,8 +1,109 @@
 #include "models.h"
-
 #include "llama-memory-recurrent.h"
 
-llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_graph_params & params) :
+void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
+    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
+
+    ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS,    hparams.rope_sections, 4, true);
+
+    // Load linear attention (gated delta net) parameters
+    ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
+    ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
+    ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
+    ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+    ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
+
+    // Mark recurrent layers (linear attention layers)
+    {
+        uint32_t full_attn_interval = 4;
+        ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
+        for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+            hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
+        }
+    }
+
+    switch (hparams.n_layer) {
+        case 40: type = LLM_TYPE_35B_A3B; break;
+        case 48: type = LLM_TYPE_122B_A10B; break;
+        case 60: type = LLM_TYPE_397B_A17B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_qwen35moe::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+    }
+
+    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+    // Calculate dimensions from hyperparameters
+    const int64_t head_k_dim = hparams.ssm_d_state;
+    const int64_t head_v_dim = hparams.ssm_d_state;
+    const int64_t n_k_heads  = hparams.ssm_n_group;
+    const int64_t n_v_heads  = hparams.ssm_dt_rank;
+    const int64_t key_dim    = head_k_dim * n_k_heads;
+    const int64_t value_dim  = head_v_dim * n_v_heads;
+    const int64_t conv_dim   = key_dim * 2 + value_dim;
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), { n_embd }, 0);
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+
+        if (!hparams.is_recurrent(i)) {
+            // Attention layers
+            create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0);
+            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+            // Q/K normalization for attention layers
+            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+        } else {
+            // Linear attention (gated delta net) specific tensors
+            // Create tensors with calculated dimensions
+            layer.wqkv           = create_tensor(tn(LLM_TENSOR_ATTN_QKV,       "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
+            layer.wqkv_gate      = create_tensor(tn(LLM_TENSOR_ATTN_GATE,      "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
+            layer.ssm_conv1d     = create_tensor(tn(LLM_TENSOR_SSM_CONV1D,     "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
+            layer.ssm_dt         = create_tensor(tn(LLM_TENSOR_SSM_DT,         "bias",   i), { hparams.ssm_dt_rank }, 0);
+            layer.ssm_a          = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN,             i), { hparams.ssm_dt_rank }, 0);
+            layer.ssm_beta       = create_tensor(tn(LLM_TENSOR_SSM_BETA,       "weight", i), { n_embd, n_v_heads }, 0);
+            layer.ssm_alpha      = create_tensor(tn(LLM_TENSOR_SSM_ALPHA,      "weight", i), { n_embd, n_v_heads }, 0);
+            layer.ssm_norm       = create_tensor(tn(LLM_TENSOR_SSM_NORM,       "weight", i), { head_v_dim }, 0);
+            layer.ssm_out        = create_tensor(tn(LLM_TENSOR_SSM_OUT,        "weight", i), { value_dim, n_embd }, 0);
+        }
+
+        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert }, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+        create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
+
+        // Shared experts
+        const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
+
+        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
+        layer.ffn_gate_shexp     = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP,     "weight", i), { n_embd, n_ff_shexp }, 0);
+        layer.ffn_up_shexp       = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,       "weight", i), { n_embd, n_ff_shexp }, 0);
+        layer.ffn_down_shexp     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP,     "weight", i), { n_ff_shexp, n_embd }, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_qwen35moe::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_build_delta_net_base(params), model(model) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
@@ -87,7 +188,7 @@ llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_gr
     ggml_build_forward_expand(gf, cur);
 }
 
-std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35moe::build_qkvz(
+std::pair<ggml_tensor *, ggml_tensor *> llama_model_qwen35moe::graph::build_qkvz(
                 ggml_tensor * input,
                         int   il) {
     const int64_t n_seqs       = ubatch.n_seqs;
@@ -103,7 +204,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35moe::build_qkvz(
     return { qkv_mixed, z };
 }
 
-ggml_tensor * llm_build_qwen35moe::build_norm_gated(
+ggml_tensor * llama_model_qwen35moe::graph::build_norm_gated(
         ggml_tensor * input,
         ggml_tensor * weights,
         ggml_tensor * gate,
@@ -114,7 +215,7 @@ ggml_tensor * llm_build_qwen35moe::build_norm_gated(
     return ggml_mul(ctx0, normalized, gated_silu);
 }
 
-ggml_tensor * llm_build_qwen35moe ::build_layer_attn(
+ggml_tensor * llama_model_qwen35moe::graph::build_layer_attn(
         llm_graph_input_attn_kv * inp,
         ggml_tensor *             cur,
         ggml_tensor *             inp_pos,
@@ -195,7 +296,7 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn(
     return cur;
 }
 
-ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear(
+ggml_tensor * llama_model_qwen35moe::graph::build_layer_attn_linear(
         llm_graph_input_rs * inp,
         ggml_tensor *        cur,
         int                  il) {
@@ -369,7 +470,7 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear(
     return cur;
 }
 
-ggml_tensor * llm_build_qwen35moe ::build_layer_ffn(ggml_tensor * cur, const int il) {
+ggml_tensor * llama_model_qwen35moe::graph::build_layer_ffn(ggml_tensor * cur, const int il) {
     // Check if this is an MoE layer
     GGML_ASSERT(model.layers[il].ffn_gate_inp != nullptr);
 
diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp
index 16bedba994d5..4440b83aa456 100644
--- a/src/models/qwen3moe.cpp
+++ b/src/models/qwen3moe.cpp
@@ -1,6 +1,65 @@
 #include "models.h"
 
-llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_qwen3moe::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
+
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    switch (hparams.n_layer) {
+        case 48: type = LLM_TYPE_30B_A3B; break;
+        case 94: type = LLM_TYPE_235B_A22B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_qwen3moe::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+        if (n_expert == 0) {
+            throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
+        }
+        if (n_expert_used == 0) {
+            throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
+        }
+
+        // MoE branch
+        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_qwen3moe::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_qwen3moe::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp
index 1beda70b7cf0..cb1b4814caf3 100644
--- a/src/models/qwen3next.cpp
+++ b/src/models/qwen3next.cpp
@@ -1,8 +1,113 @@
 #include "models.h"
-
 #include "llama-memory-recurrent.h"
 
-llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_graph_params & params) :
+void llama_model_qwen3next::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp, false);
+    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
+
+    // Load linear attention (gated delta net) parameters
+    ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
+    ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
+    ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
+    ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+    ml.get_key(LLM_KV_SSM_GROUP_COUNT,    hparams.ssm_n_group);
+
+    // Mark recurrent layers (linear attention layers)
+    {
+        uint32_t full_attn_interval = 4;
+        ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
+        for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+            hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
+        }
+    }
+
+    switch (hparams.n_layer) {
+        case 48: type = LLM_TYPE_80B_A3B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_qwen3next::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    if (n_expert == 0) {
+        throw std::runtime_error(arch_name() + " model cannot have zero experts");
+    }
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+    }
+
+    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+    // Calculate dimensions from hyperparameters
+    const int64_t head_k_dim = hparams.ssm_d_state;
+    const int64_t head_v_dim = hparams.ssm_d_state;
+    const int64_t n_k_heads  = hparams.ssm_n_group;
+    const int64_t n_v_heads  = hparams.ssm_dt_rank;
+    const int64_t key_dim    = head_k_dim * n_k_heads;
+    const int64_t value_dim  = head_v_dim * n_v_heads;
+    const int64_t conv_dim   = key_dim * 2 + value_dim;
+
+    // Calculate projection sizes
+    const int64_t qkvz_dim = key_dim * 2 + value_dim * 2;
+    const int64_t ba_dim   = n_v_heads * 2;
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+        const uint32_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : hparams.n_ff(i);
+
+        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), { n_embd }, 0);
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+
+        if (!hparams.is_recurrent(i)) {
+            // Attention layers
+            create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0);
+            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+            // Q/K normalization for attention layers
+            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+        } else {
+            // Linear attention (gated delta net) specific tensors
+            // Create tensors with calculated dimensions
+            // note: ssm_in is used by legacy GGUF
+            layer.ssm_in         = create_tensor(tn(LLM_TENSOR_SSM_IN,         "weight", i), { n_embd, qkvz_dim }, TENSOR_NOT_REQUIRED);
+            layer.wqkv           = create_tensor(tn(LLM_TENSOR_ATTN_QKV,       "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
+            layer.wqkv_gate      = create_tensor(tn(LLM_TENSOR_ATTN_GATE,      "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
+            layer.ssm_conv1d     = create_tensor(tn(LLM_TENSOR_SSM_CONV1D,     "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
+            layer.ssm_dt         = create_tensor(tn(LLM_TENSOR_SSM_DT,         "bias",   i), { hparams.ssm_dt_rank }, 0);
+            layer.ssm_a          = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN,             i), { hparams.ssm_dt_rank }, 0);
+            layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
+            layer.ssm_norm       = create_tensor(tn(LLM_TENSOR_SSM_NORM,       "weight", i), { head_v_dim }, 0);
+            layer.ssm_out        = create_tensor(tn(LLM_TENSOR_SSM_OUT,        "weight", i), { value_dim, n_embd }, 0);
+        }
+
+        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert }, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+        create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
+
+        // Shared experts
+        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
+        layer.ffn_gate_shexp     = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP,     "weight", i), { n_embd, n_ff_shexp }, 0);
+        layer.ffn_up_shexp       = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,       "weight", i), { n_embd, n_ff_shexp }, 0);
+        layer.ffn_down_shexp     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP,     "weight", i), { n_ff_shexp, n_embd }, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_qwen3next::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_qwen3next::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_build_delta_net_base(params), model(model) {
     ggml_tensor * cur;
     ggml_tensor * inpL;
@@ -87,7 +192,7 @@ static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t
         t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c);
 }
 
-ggml_tensor * llm_build_qwen3next::build_norm_gated(
+ggml_tensor * llama_model_qwen3next::graph::build_norm_gated(
         ggml_tensor * input,
         ggml_tensor * weights,
         ggml_tensor * gate,
@@ -98,7 +203,7 @@ ggml_tensor * llm_build_qwen3next::build_norm_gated(
     return ggml_mul(ctx0, normalized, gated_silu);
 }
 
-ggml_tensor * llm_build_qwen3next::build_layer_attn(
+ggml_tensor * llama_model_qwen3next::graph::build_layer_attn(
         llm_graph_input_attn_kv * inp,
         ggml_tensor *             cur,
         ggml_tensor *             inp_pos,
@@ -178,7 +283,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
     return cur;
 }
 
-std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_qkvz(
+std::pair<ggml_tensor *, ggml_tensor *> llama_model_qwen3next::graph::build_qkvz(
                 ggml_tensor * input,
                         int   il) {
     const int64_t d_inner      = hparams.ssm_d_inner;
@@ -259,7 +364,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_qkvz(
     }
 }
 
-ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
+ggml_tensor * llama_model_qwen3next::graph::build_layer_attn_linear(
         llm_graph_input_rs * inp,
         ggml_tensor *        cur,
         int                  il) {
@@ -468,7 +573,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
     return cur;
 }
 
-ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int il) {
+ggml_tensor * llama_model_qwen3next::graph::build_layer_ffn(ggml_tensor * cur, const int il) {
     // Check if this is an MoE layer
     if (model.layers[il].ffn_gate_inp != nullptr) {
         // MoE branch
diff --git a/src/models/qwen3vl.cpp b/src/models/qwen3vl.cpp
index faa5f2ef3c85..7871f8f7952c 100644
--- a/src/models/qwen3vl.cpp
+++ b/src/models/qwen3vl.cpp
@@ -1,6 +1,56 @@
 #include "models.h"
 
-llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_qwen3vl::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
+    ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    switch (hparams.n_layer) {
+        case 28: type = LLM_TYPE_1_7B; break;
+        case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
+        case 64: type = LLM_TYPE_32B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_qwen3vl::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    // output rerank head
+    cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_qwen3vl::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_qwen3vl::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const size_t n_deepstack_layers = hparams.n_deepstack_layers;
 
     const int64_t n_embd      = hparams.n_embd;
diff --git a/src/models/qwen3vl-moe.cpp b/src/models/qwen3vlmoe.cpp
similarity index 57%
rename from src/models/qwen3vl-moe.cpp
rename to src/models/qwen3vlmoe.cpp
index 29ee8278a4d9..b99143c8908a 100644
--- a/src/models/qwen3vl-moe.cpp
+++ b/src/models/qwen3vlmoe.cpp
@@ -1,6 +1,66 @@
 #include "models.h"
 
-llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_qwen3vlmoe::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
+    ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    switch (hparams.n_layer) {
+        case 48: type = LLM_TYPE_30B_A3B; break;
+        case 94: type = LLM_TYPE_235B_A22B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_qwen3vlmoe::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+        if (n_expert == 0) {
+            throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
+        }
+        if (n_expert_used == 0) {
+            throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
+        }
+
+        // MoE branch
+        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_qwen3vlmoe::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_qwen3vlmoe::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const size_t n_deepstack_layers = hparams.n_deepstack_layers;
 
     const int64_t n_embd      = hparams.n_embd;
@@ -127,4 +187,3 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_
 
     ggml_build_forward_expand(gf, cur);
 }
-
diff --git a/src/models/refact.cpp b/src/models/refact.cpp
index 398eb368db09..f14f10917ffc 100644
--- a/src/models/refact.cpp
+++ b/src/models/refact.cpp
@@ -1,6 +1,81 @@
 #include "models.h"
 
-llm_build_refact::llm_build_refact(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_refact::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_1B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+
+    // TODO: become GGUF KV parameter
+    hparams.f_max_alibi_bias = 8.0f;
+}
+
+void llama_model_refact::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        // optional bias tensors
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        }
+        else {
+            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        }
+
+        if (n_expert == 0) {
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+            // optional MLP bias
+            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+            layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+        } else {
+            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
+            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
+
+            // For Granite MoE Shared
+            if (hparams.n_ff_shexp > 0) {
+                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+            }
+        }
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_refact::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_refact::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/rnd1.cpp b/src/models/rnd1.cpp
index a917c19f25a6..325ee73ba5c7 100644
--- a/src/models/rnd1.cpp
+++ b/src/models/rnd1.cpp
@@ -1,7 +1,67 @@
 #include "models.h"
 
+void llama_model_rnd1::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    switch (hparams.n_layer) {
+        case 48: type = LLM_TYPE_30B_A3B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+    // Set non-causal attention for diffusion models
+    hparams.causal_attn = false;
+}
+
+void llama_model_rnd1::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+        if (n_expert == 0) {
+            throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
+        }
+        if (n_expert_used == 0) {
+            throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
+        }
+
+        // MoE branch
+        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_rnd1::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
 // RND1 is a Qwen3Moe AR model converted to diffusion model.
-llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+llama_model_rnd1::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/rwkv6.cpp b/src/models/rwkv6.cpp
index 032b219d6cbf..2944711acec7 100644
--- a/src/models/rwkv6.cpp
+++ b/src/models/rwkv6.cpp
@@ -1,6 +1,97 @@
 #include "models.h"
 
-llm_build_rwkv6::llm_build_rwkv6(const llama_model & model, const llm_graph_params & params) :
+void llama_model_rwkv6::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,     hparams.f_norm_eps, false);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
+    ml.get_key(LLM_KV_WKV_HEAD_SIZE,               hparams.wkv_head_size);
+    ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM,          hparams.time_mix_extra_dim);
+    ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM,        hparams.time_decay_extra_dim);
+    ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS,      hparams.rescale_every_n_layers, false);
+    ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,           hparams.token_shift_count, false);
+
+    switch (hparams.n_layer) {
+        case 24: type = LLM_TYPE_1_6B; break;
+        case 32:
+            switch (hparams.n_embd) {
+                case 2560: type = LLM_TYPE_3B; break;
+                case 4096: type = LLM_TYPE_7B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 61: type = LLM_TYPE_14B; break;
+        case 64: type = LLM_TYPE_32B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_rwkv6::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // Block 0, LN0
+    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0);
+    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {n_embd}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
+    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
+    const int head_size = hparams.wkv_head_size;
+    const int attn_hidden_size = n_embd;
+    const int ffn_size = hparams.n_ff_arr[0];
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
+
+        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
+        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, 0);
+
+        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
+        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
+
+        layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
+        layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+        layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+        layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+        layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+        layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
+        GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
+
+        layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
+        layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
+        layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
+        layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
+        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
+        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
+        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
+        layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
+
+        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
+        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
+        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
+
+        layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
+        layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
+
+        layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
+        layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
+        layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
+    }
+
+}
+
+std::unique_ptr<llm_graph_context> llama_model_rwkv6::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_rwkv6::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_build_rwkv6_base(model, params) {
     GGML_ASSERT(hparams.token_shift_count == 2);
 
diff --git a/src/models/rwkv6qwen2.cpp b/src/models/rwkv6qwen2.cpp
index e84e59738207..6f7d1f5722f9 100644
--- a/src/models/rwkv6qwen2.cpp
+++ b/src/models/rwkv6qwen2.cpp
@@ -1,6 +1,87 @@
 #include "models.h"
 
-llm_build_rwkv6qwen2::llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
+void llama_model_rwkv6qwen2::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,     hparams.f_norm_eps, false);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
+    ml.get_key(LLM_KV_WKV_HEAD_SIZE,               hparams.wkv_head_size);
+    ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM,          hparams.time_mix_extra_dim);
+    ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM,        hparams.time_decay_extra_dim);
+    ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS,      hparams.rescale_every_n_layers, false);
+    ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,           hparams.token_shift_count, false);
+
+    switch (hparams.n_layer) {
+        case 24: type = LLM_TYPE_1_6B; break;
+        case 32:
+            switch (hparams.n_embd) {
+                case 2560: type = LLM_TYPE_3B; break;
+                case 4096: type = LLM_TYPE_7B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 61: type = LLM_TYPE_14B; break;
+        case 64: type = LLM_TYPE_32B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_rwkv6qwen2::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
+    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
+    const int head_size = hparams.wkv_head_size;
+    const int attn_hidden_size = n_embd;
+    int attn_key_value_size;
+    if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
+        attn_key_value_size = attn_hidden_size;
+    } else {
+        attn_key_value_size = n_head_kv * head_size;
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
+        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
+
+        layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
+        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
+
+        layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
+        layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
+        layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
+        layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
+        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
+        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
+        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
+        layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
+        // optional bias tensors
+        layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
+        layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
+        layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
+
+        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_rwkv6qwen2::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_rwkv6qwen2::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
     GGML_ASSERT(n_embd == hparams.n_embd_r());
 
     ggml_tensor * cur;
diff --git a/src/models/rwkv7.cpp b/src/models/rwkv7.cpp
index 16ffa6901b95..b205e3935e1d 100644
--- a/src/models/rwkv7.cpp
+++ b/src/models/rwkv7.cpp
@@ -1,6 +1,127 @@
 #include "models.h"
 
-llm_build_rwkv7::llm_build_rwkv7(const llama_model & model, const llm_graph_params & params) :
+void llama_model_rwkv7::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,                hparams.f_norm_eps, false);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,            hparams.f_norm_rms_eps, false);
+    ml.get_key(LLM_KV_WKV_HEAD_SIZE,                          hparams.wkv_head_size);
+    ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK,              hparams.n_lora_decay);
+    ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK,               hparams.n_lora_iclr);
+    ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix);
+    ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK,               hparams.n_lora_gate, false);
+    ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT,                      hparams.token_shift_count, false);
+
+    switch (hparams.n_layer) {
+        case 12:
+            switch (hparams.n_embd) {
+                case 768: type = LLM_TYPE_190M; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 24:
+            switch (hparams.n_embd) {
+                case 1024: type = LLM_TYPE_450M; break;
+                case 2048: type = LLM_TYPE_1_5B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 28:
+            switch (hparams.n_embd) {
+                case 1536: type = LLM_TYPE_1_5B; break;
+                case 3584: type = LLM_TYPE_7B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 32:
+            switch (hparams.n_embd) {
+                case 2560: type = LLM_TYPE_2_9B; break;
+                case 4096: type = LLM_TYPE_7B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 61:
+            switch (hparams.n_embd) {
+                case 4096: type = LLM_TYPE_14B; break;
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_rwkv7::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // Block 0, LN0
+    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {n_embd}, 0);
+    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {n_embd}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+    const int n_lora_decay = hparams.n_lora_decay;
+    const int n_lora_iclr = hparams.n_lora_iclr;
+    const int n_lora_value_res_mix = hparams.n_lora_value_res_mix;
+    const int n_lora_gate = hparams.n_lora_gate;
+    const int attn_hidden_size = n_embd;
+    const int ffn_size = hparams.n_ff_arr[0];
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
+
+        layer.attn_norm_2   = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
+        layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, 0);
+
+        layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0);
+        layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0);
+        layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0);
+
+        layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0);
+        layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
+        layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
+
+        if (i == 0) {
+            // actually not used
+            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
+            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
+            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
+        } else {
+            layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
+            layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
+            layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
+        }
+
+        layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0);
+        layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0);
+
+        layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
+
+        layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0);
+        layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0);
+        layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0);
+
+        layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
+        layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
+        layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
+
+        layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
+        layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
+        layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
+
+        layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
+
+        layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
+        layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
+    }
+
+}
+
+std::unique_ptr<llm_graph_context> llama_model_rwkv7::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_rwkv7::graph::graph(const llama_model & model, const llm_graph_params & params) :
     llm_build_rwkv7_base(model, params) {
     GGML_ASSERT(hparams.token_shift_count == 2);
 
diff --git a/src/models/seed-oss.cpp b/src/models/seed-oss.cpp
index 6db8d9781fe0..83e114740b62 100644
--- a/src/models/seed-oss.cpp
+++ b/src/models/seed-oss.cpp
@@ -1,6 +1,51 @@
 #include "models.h"
 
-llm_build_seed_oss::llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_seed_oss::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    switch (hparams.n_layer) {
+        case 64: type = LLM_TYPE_36B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_seed_oss::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const uint32_t head_dim             = hparams.n_embd_head_k();
+    const int64_t n_qo_dim              = n_head * head_dim;
+    const int64_t n_kv_dim              = n_head_kv * head_dim;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        create_tensor_qkv(layer, i, n_embd, n_qo_dim, n_kv_dim, n_kv_dim, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
+
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_seed_oss::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_seed_oss::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/smallthinker.cpp b/src/models/smallthinker.cpp
index 55d09ec325d2..3214e7cbad38 100644
--- a/src/models/smallthinker.cpp
+++ b/src/models/smallthinker.cpp
@@ -1,7 +1,80 @@
 #include "models.h"
 
+void llama_model_smallthinker::load_arch_hparams(llama_model_loader & ml) {
+    const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+
+    if (found_swa && hparams.n_swa > 0) {
+        hparams.swa_type    = LLAMA_SWA_TYPE_STANDARD;
+        hparams.n_swa       = 4096;
+        uint32_t swa_period = 4;
+        ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+        hparams.set_swa_pattern(swa_period, true);
+
+        hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+    } else {
+        hparams.swa_type             = LLAMA_SWA_TYPE_NONE;
+        hparams.n_no_rope_layer_step = hparams.n_layer;
+    }
+
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp, false);
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
+
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_4B;  break;
+        case 52: type = LLM_TYPE_20B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_smallthinker::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+
+        GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
+        GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");
+
+        // MoE branch
+        const int64_t n_ff_exp = hparams.n_ff_exp;
+        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_smallthinker::build_arch_graph(const llm_graph_params & params) const {
+    if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+        return std::make_unique<graph<true>> (*this, params);
+    } else {
+        return std::make_unique<graph<false>>(*this, params);
+    }
+}
+
 template <bool iswa>
-llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
+llama_model_smallthinker::graph<iswa>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
@@ -113,5 +186,5 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
 }
 
 // Explicit template instantiations
-template struct llm_build_smallthinker<false>;
-template struct llm_build_smallthinker<true>;
+template struct llama_model_smallthinker::graph<false>;
+template struct llama_model_smallthinker::graph<true>;
diff --git a/src/models/smollm3.cpp b/src/models/smollm3.cpp
index 83636dbf546e..7adaf34c5345 100644
--- a/src/models/smollm3.cpp
+++ b/src/models/smollm3.cpp
@@ -1,6 +1,49 @@
 #include "models.h"
 
-llm_build_smollm3::llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_smollm3::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    hparams.n_no_rope_layer_step = 4;
+
+    switch (hparams.n_layer) {
+        case 36: type = LLM_TYPE_3B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_smollm3::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_smollm3::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_smollm3::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/stablelm.cpp b/src/models/stablelm.cpp
index 9c19abd8835c..8f613e559470 100644
--- a/src/models/stablelm.cpp
+++ b/src/models/stablelm.cpp
@@ -1,6 +1,54 @@
 #include "models.h"
 
-llm_build_stablelm::llm_build_stablelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_stablelm::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+    switch (hparams.n_layer) {
+        case 24: type = LLM_TYPE_1B; break;
+        case 32: type = LLM_TYPE_3B; break;
+        case 40: type = LLM_TYPE_12B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+   }
+}
+
+void llama_model_stablelm::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm =   create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        // optional q and k layernorms, present in StableLM 2 12B
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head},    TENSOR_NOT_REQUIRED);
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
+
+        // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
+        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_stablelm::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_stablelm::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/starcoder.cpp b/src/models/starcoder.cpp
index cf9fe95c35bf..58cf0ac0edc1 100644
--- a/src/models/starcoder.cpp
+++ b/src/models/starcoder.cpp
@@ -1,6 +1,62 @@
 #include "models.h"
 
-llm_build_starcoder::llm_build_starcoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_starcoder::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+    switch (hparams.n_layer) {
+        case 24: type = LLM_TYPE_1B; break;
+        case 36: type = LLM_TYPE_3B; break;
+        case 42: type = LLM_TYPE_7B; break;
+        case 40: type = LLM_TYPE_15B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_starcoder::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+    pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD,   "weight"), {n_embd, n_ctx_train}, 0);
+
+    // output
+    {
+        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+        output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+        if (!output) {
+            // needs to be on GPU
+            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+        }
+
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
+
+        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+        layer.wqkv_b = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
+
+        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, 0);
+
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff}, 0);
+        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i),     {n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_starcoder::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_starcoder::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/starcoder2.cpp b/src/models/starcoder2.cpp
index b6d4d5aac1ab..45dae0602d44 100644
--- a/src/models/starcoder2.cpp
+++ b/src/models/starcoder2.cpp
@@ -1,6 +1,61 @@
 #include "models.h"
 
-llm_build_starcoder2::llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_starcoder2::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+    switch (hparams.n_layer) {
+        case 30: type = LLM_TYPE_3B; break;
+        case 32: type = LLM_TYPE_7B; break;
+        case 40: type = LLM_TYPE_15B; break;
+        case 52: type = LLM_TYPE_20B; break; // granite
+        case 88: type = LLM_TYPE_34B; break; // granite
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_starcoder2::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        // optional bias tensors
+        layer.wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, 0);
+
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+        // optional bias tensors
+        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP ,  "bias", i), {  n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_starcoder2::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_starcoder2::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/step35-iswa.cpp b/src/models/step35.cpp
similarity index 52%
rename from src/models/step35-iswa.cpp
rename to src/models/step35.cpp
index 86aa98909e74..c4789752d21b 100644
--- a/src/models/step35-iswa.cpp
+++ b/src/models/step35.cpp
@@ -1,6 +1,108 @@
 #include "models.h"
 
-llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_step35::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+
+    // full_attention layer only use half of the RoPE dimensions
+    hparams.n_rot_full = hparams.n_rot_full / 2;
+
+    // MoE + SWA parameters
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
+
+    // Step35 uses sigmoid gating by default (if not set in GGUF)
+    if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+        hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+    }
+
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,  hparams.n_swa);
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,        hparams.rope_freq_base_train_swa, false);
+    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
+    ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP,   hparams.swiglu_clamp_exp,   hparams.n_layer, false);
+    ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false);
+
+    switch (hparams.n_layer) {
+        case 45: type = LLM_TYPE_196B_A11B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_step35::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    // STEP35 supports per-layer partial RoPE dims; rope factors are stored as a single shared tensor
+    // ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer.
+    uint32_t n_rot_max = 0;
+    for (int i = 0; i < n_layer; ++i) {
+        n_rot_max = std::max(n_rot_max, hparams.n_rot(i));
+    }
+    if (n_rot_max == 0) {
+        n_rot_max = n_rot;
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        const uint32_t n_head_l      = hparams.n_head(i);
+        const uint32_t n_embd_k_gqa  = hparams.n_embd_k_gqa(i);
+        const uint32_t n_embd_v_gqa  = hparams.n_embd_v_gqa(i);
+
+        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
+
+        // optional rope factors (llama3) / longrope tensors
+        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        } else {
+            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+        }
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_l, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0);
+
+        // head-wise attention gate (Step35 self_attn.g_proj)
+        layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+        // dense MLP (leading dense blocks)
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, TENSOR_NOT_REQUIRED);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
+
+        // MoE routed experts + selection bias (router_bias)
+        const int64_t n_ff_exp = hparams.n_ff_exp;
+        layer.ffn_gate_inp      = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+        layer.ffn_gate_exps     = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp,   n_expert}, TENSOR_NOT_REQUIRED);
+        layer.ffn_down_exps     = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+        layer.ffn_up_exps       = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp,   n_expert}, TENSOR_NOT_REQUIRED);
+        layer.ffn_exp_probs_b   = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+
+        // shared expert MLP
+        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
+        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
+        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_step35::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_step35::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
diff --git a/src/models/t5.cpp b/src/models/t5.cpp
index 9f9dfef40123..27a0711ba414 100644
--- a/src/models/t5.cpp
+++ b/src/models/t5.cpp
@@ -1,7 +1,125 @@
 #include "models.h"
 
+void llama_model_t5::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+
+    uint32_t dec_start_token_id;
+    if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
+        hparams.dec_start_token_id = dec_start_token_id;
+    }
+
+    hparams.dec_n_layer = hparams.n_layer;
+    ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
+
+    switch (hparams.n_layer) {
+        case 6:  type = LLM_TYPE_60M;  break; // t5-small
+        case 8:  type = LLM_TYPE_80M;  break; // flan-t5-small
+        case 12:
+            switch (hparams.n_ff()) {
+                case 3072: type = LLM_TYPE_220M; break; // t5-base
+                case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        case 24:
+            switch (hparams.n_ff()) {
+                case 4096:  type = LLM_TYPE_770M; break; // t5-large
+                case 2816:  type = LLM_TYPE_780M; break; // flan-t5-large
+                case 16384: type = LLM_TYPE_3B;   break; // t5-3b
+                case 5120:  type = LLM_TYPE_3B;   break; // flan-t5-xl
+                case 65536: type = LLM_TYPE_11B;  break; // t5-11b
+                case 10240: type = LLM_TYPE_11B;  break; // flan-t5-xxl
+                default: type = LLM_TYPE_UNKNOWN;
+            } break;
+        default: type = LLM_TYPE_UNKNOWN;
+   }
+}
+
+void llama_model_t5::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output_norm     = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    // n_layer:     number of encoder_layers
+    // dec_n_layer: number of decoder_layers
+    const int dec_n_layer = hparams.dec_n_layer;
+    if (dec_n_layer > n_layer) {
+        layers.resize(dec_n_layer);
+    }
+
+    // load encoder layers
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);
+        layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
+
+        layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+        layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+        layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+        layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
+
+        layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
+        layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+
+    // load decoder layers
+    for (int i = 0; i < dec_n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM,  "weight", i), {n_embd}, 0);
+        layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
+
+        layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+        layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+        layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
+
+        layer.attn_norm_cross  = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM,  "weight", i), {n_embd}, 0);
+        // this tensor seems to be unused in HF transformers implementation
+        layer.attn_rel_b_cross = create_tensor(
+            tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
+
+        layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+        layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+        layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+        layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_t5::build_arch_graph(const llm_graph_params & params) const {
+    switch (params.gtype) {
+        case LLM_GRAPH_TYPE_ENCODER:
+            return std::make_unique<graph<true>>(*this, params);
+        case LLM_GRAPH_TYPE_DEFAULT:
+        case LLM_GRAPH_TYPE_DECODER:
+            return std::make_unique<graph<false>>(*this, params);
+        default:
+            GGML_ABORT("invalid graph type");
+    };
+}
+
 template <>
-llm_build_t5<false>::llm_build_t5(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+llama_model_t5::graph<false>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
     //const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
 
@@ -156,7 +274,7 @@ llm_build_t5<false>::llm_build_t5(const llama_model & model, const llm_graph_par
 }
 
 template <>
-llm_build_t5<true>::llm_build_t5(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+llama_model_t5::graph<true>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
diff --git a/src/models/t5encoder.cpp b/src/models/t5encoder.cpp
index 5c1f9eb40300..23c5f9b6a1ca 100644
--- a/src/models/t5encoder.cpp
+++ b/src/models/t5encoder.cpp
@@ -1,3 +1,44 @@
 #include "models.h"
 
-llm_build_t5encoder::llm_build_t5encoder(const llama_model & model, const llm_graph_params & params) : llm_build_t5<true>(model, params) {}
+void llama_model_t5encoder::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+    type = LLM_TYPE_UNKNOWN;
+}
+
+void llama_model_t5encoder::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    // if output is NULL, init from the input tok embed
+    if (output == NULL) {
+        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm_enc  = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM,  "weight", i), {n_embd}, 0);
+        layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
+
+        layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+        layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+        layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+        layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
+
+        layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd,   n_ff}, TENSOR_NOT_REQUIRED);
+        layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_t5encoder::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
diff --git a/src/models/wavtokenizer-dec.cpp b/src/models/wavtokenizer-dec.cpp
index a7776d9cdc97..a873e5d2e8fa 100644
--- a/src/models/wavtokenizer-dec.cpp
+++ b/src/models/wavtokenizer-dec.cpp
@@ -1,6 +1,121 @@
 #include "models.h"
 
-llm_build_wavtokenizer_dec::llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_wavtokenizer_dec::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
+    ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS,    hparams.f_norm_group_eps);
+    ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
+}
+
+void llama_model_wavtokenizer_dec::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd, n_vocab}, 0);
+
+    conv1d   = create_tensor(tn(LLM_TENSOR_CONV1D, "weight", 0), {7, hparams.n_embd, hparams.posnet.n_embd}, 0);
+    conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias",   0), {1, hparams.posnet.n_embd}, 0);
+
+    // posnet
+    {
+        const int64_t n_embd = hparams.posnet.n_embd;
+
+        for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
+            auto & layer = layers[i].posnet;
+
+            // posnet:
+            //
+            //  - resnet
+            //  - resnet
+            //  - attn
+            //  - resnet
+            //  - resnet
+            //  - norm
+            //
+            switch (i) {
+                case 0:
+                case 1:
+                case 3:
+                case 4:
+                    {
+                        layer.norm1   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
+                        layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias",   i), {1, n_embd}, 0);
+
+                        layer.conv1   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
+                        layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias",   i), {1, n_embd}, 0);
+
+                        layer.norm2   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
+                        layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias",   i), {1, n_embd}, 0);
+
+                        layer.conv2   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
+                        layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias",   i), {1, n_embd}, 0);
+                    } break;
+                case 2:
+                    {
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);
+
+                        layer.attn_q      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,    "weight", i), {1, n_embd, n_embd}, 0);
+                        layer.attn_q_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,    "bias",   i), {1, n_embd}, 0);
+
+                        layer.attn_k      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,    "weight", i), {1, n_embd, n_embd}, 0);
+                        layer.attn_k_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,    "bias",   i), {1, n_embd}, 0);
+
+                        layer.attn_v      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,    "weight", i), {1, n_embd, n_embd}, 0);
+                        layer.attn_v_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,    "bias",   i), {1, n_embd}, 0);
+
+                        layer.attn_o      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT,  "weight", i), {1, n_embd, n_embd}, 0);
+                        layer.attn_o_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT,  "bias",   i), {1, n_embd}, 0);
+                    } break;
+                case 5:
+                    {
+                        layer.norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
+                        layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);
+                    } break;
+                default: GGML_ABORT("unknown posnet layer");
+            };
+        }
+    }
+
+    GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
+
+    tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight", 0), {hparams.posnet.n_embd}, 0);
+    tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias",   0), {hparams.posnet.n_embd}, 0);
+
+    // convnext
+    {
+        const int64_t n_embd = hparams.convnext.n_embd;
+
+        for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
+            auto & layer = layers[i].convnext;
+
+            layer.dw     = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "weight", i), {7, 1, n_embd}, 0);
+            layer.dw_b   = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "bias",   i), {1, n_embd}, 0);
+
+            layer.norm   = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "weight", i), {n_embd}, 0);
+            layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "bias",   i), {n_embd}, 0);
+
+            layer.pw1    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "weight", i), {n_embd, n_ff}, 0);
+            layer.pw1_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "bias",   i), {n_ff}, 0);
+
+            layer.pw2    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "weight", i), {n_ff, n_embd}, 0);
+            layer.pw2_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "bias",   i), {n_embd}, 0);
+
+            layer.gamma  = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
+        }
+
+        // output
+        output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+    }
+
+    output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, hparams.n_embd_out()}, 0);
+    output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {hparams.n_embd_out()}, 0);
+}
+
+std::unique_ptr<llm_graph_context> llama_model_wavtokenizer_dec::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_wavtokenizer_dec::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
diff --git a/src/models/xverse.cpp b/src/models/xverse.cpp
index 53085ec80f6b..e4d111e622ac 100644
--- a/src/models/xverse.cpp
+++ b/src/models/xverse.cpp
@@ -1,6 +1,43 @@
 #include "models.h"
 
-llm_build_xverse::llm_build_xverse(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+void llama_model_xverse::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    switch (hparams.n_layer) {
+        case 32: type = LLM_TYPE_7B; break;
+        case 40: type = LLM_TYPE_13B; break;
+        case 80: type = LLM_TYPE_65B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_xverse::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_xverse::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_xverse::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

From a4701c98f72160144b101090596c7ea1ef0c1d7b Mon Sep 17 00:00:00 2001
From: "Piotr Wilkin (ilintar)" <piotr.wilkin@syndatis.com>
Date: Mon, 4 May 2026 13:18:11 +0200
Subject: [PATCH 08/13] common/autoparser: fixes for newline handling / forced
 tool calls (#22654)

* chat/autoparser: the fixes

* Move optspace() to chat-peg-parser, comment out server tests invalidated due to content now allowed with forced tool calls.

* Trim whitespace on apply instead
---
 common/chat-auto-parser-generator.cpp     |  16 +-
 common/chat-diff-analyzer.cpp             |   8 +-
 common/chat-peg-parser.cpp                |  26 ++
 common/chat-peg-parser.h                  |   3 +
 common/chat.cpp                           |   4 +-
 common/reasoning-budget.cpp               |   2 +
 scripts/server-test-function-call.py      |  32 ++-
 tests/test-chat.cpp                       | 277 +++++++++++++++++++++-
 tests/test-reasoning-budget.cpp           |  14 +-
 tools/server/tests/unit/test_tool_call.py | 127 +++++-----
 10 files changed, 402 insertions(+), 107 deletions(-)

diff --git a/common/chat-auto-parser-generator.cpp b/common/chat-auto-parser-generator.cpp
index 453559a4b04e..2fac0ea5c002 100644
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@@ -136,10 +136,10 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
         if (!end.empty()) {
             if (!start.empty()) {
                 // Standard tag-based: optional(<think>reasoning</think>)
-                return p.optional(start + p.reasoning(p.until(end)) + end + p.space());
+                return p.optional(p.optspace(start) + p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end));
             }
             // Delimiter-style (empty start)
-            return p.optional(p.reasoning(p.until(end)) + end + p.space());
+            return p.optional(p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end));
         }
     }
 
@@ -186,7 +186,6 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
 common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_context & ctx) const {
     auto &       p           = ctx.p;
     const auto & inputs      = ctx.inputs;
-    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
 
     // Build effective field names with dot notation if function_field is set
     std::string name_field = format.name_field;
@@ -225,8 +224,7 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
         tool_start = format.per_call_start;
     }
 
-    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(p.until(tool_start)))) + tools_parser +
-           p.end();
+    return ctx.reasoning_parser + p.optional(p.content(p.until(tool_start))) + tools_parser + p.end();
 }
 
 common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p, const std::string & name,
@@ -270,7 +268,6 @@ common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p,
 common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context & ctx) const {
     auto &       p           = ctx.p;
     const auto & inputs      = ctx.inputs;
-    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
 
     common_peg_parser tool_choice = p.choice();
 
@@ -336,14 +333,12 @@ common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context
 
     std::string trigger_marker       = !format.section_start.empty() ? format.section_start : format.per_call_start;
     auto        content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
-    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
-           p.end();
+    return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end();
 }
 
 common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_context & ctx) const {
     auto &       p           = ctx.p;
     const auto & inputs      = ctx.inputs;
-    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
 
     auto until_suffix = p.rule("until-suffix", p.until(arguments.value_suffix));
 
@@ -471,8 +466,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
 
     std::string trigger_marker       = !format.section_start.empty() ? format.section_start : format.per_call_start;
     auto        content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
-    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
-           p.end();
+    return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end();
 }
 
 }  // namespace autoparser
diff --git a/common/chat-diff-analyzer.cpp b/common/chat-diff-analyzer.cpp
index 264ace4627c9..9c7c9678acd8 100644
--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
@@ -342,7 +342,7 @@ void analyze_reasoning::compare_thinking_enabled() {
     if (left_trimmed.empty() && !diff.right.empty()) {
         if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
             if (start.empty()) {
-                start = trim_leading_whitespace(diff.right);
+                start = diff.right;
                 mode  = reasoning_mode::TAG_BASED;
             }
         }
@@ -353,7 +353,7 @@ void analyze_reasoning::compare_thinking_enabled() {
                 if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) {
                     start = seg[seg.size() - 2].value;
                 }
-                end = trim_trailing_whitespace(diff.left);
+                end = diff.left;
                 mode = reasoning_mode::TAG_BASED;
             }
         }
@@ -445,14 +445,14 @@ void analyze_reasoning::compare_reasoning_scope() {
         auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
         if (result.result.success()) {
             start = result.tags["pre"];
-            end = trim_trailing_whitespace(result.tags["post"]);
+            end = result.tags["post"];
         } else {
             auto parser_delimiter = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
                 return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())));
             });
             result = parser_delimiter.parse_anywhere_and_extract(comparison->output_B);
             if (result.result.success()) {
-                end = trim_trailing_whitespace(result.tags["post"]);
+                end = result.tags["post"];
             } else {
                 LOG_DBG(ANSI_ORANGE "%s: Unable to extract reasoning markers, falling back to reasoning = NONE\n" ANSI_RESET, __func__);
                 mode = reasoning_mode::NONE;
diff --git a/common/chat-peg-parser.cpp b/common/chat-peg-parser.cpp
index 56eb567df0a4..a4818859a6da 100644
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@@ -816,6 +816,32 @@ common_peg_parser common_chat_peg_builder::prefix(const std::string & s, const s
     return literal(s.substr(0, s.rfind(delimiter)));
 }
 
+common_peg_parser common_chat_peg_builder::optspace(const std::string & tag) {
+    auto parser = eps();
+    size_t end_of_prefix_space = tag.size();
+    size_t start_of_suffix_space = tag.size();
+    for (size_t i = 0; i < tag.size(); i++) {
+        if (!std::isspace(tag[i])) {
+            end_of_prefix_space = i;
+            break;
+        }
+    }
+    for (size_t i = tag.size(); i > 0; i--) {
+        if (!std::isspace(tag[i - 1])) {
+            start_of_suffix_space = i;
+            break;
+        }
+    }
+    for (size_t i = 0; i < end_of_prefix_space; i++) {
+        parser += optional(literal(std::string(1, tag[i])));
+    }
+    parser += literal(tag.substr(end_of_prefix_space, start_of_suffix_space - end_of_prefix_space));
+    for (size_t i = start_of_suffix_space; i < tag.size(); i++) {
+        parser += optional(literal(std::string(1, tag[i])));
+    }
+    return parser;
+}
+
 common_peg_parser common_chat_peg_builder::standard_json_tools(
                                                        const std::string &              section_start,
                                                        const std::string &              section_end,
diff --git a/common/chat-peg-parser.h b/common/chat-peg-parser.h
index 1ea3eb7eb862..c684d773564c 100644
--- a/common/chat-peg-parser.h
+++ b/common/chat-peg-parser.h
@@ -96,6 +96,9 @@ class common_chat_peg_builder : public common_peg_parser_builder {
     // Return a parser that parses the prefix of a string, up to a given delimiter.
     common_peg_parser prefix(const std::string & s, const std::string & delimiter = {});
 
+    // Return a parser that parses all elements of tag, but leading and trailing spaces are optional
+    common_peg_parser optspace(const std::string & tag);
+
     // Legacy-compatible helper for building standard JSON tool calls
     // Used by tests and manual parsers
     // name_key/args_key: JSON key names for function name and arguments
diff --git a/common/chat.cpp b/common/chat.cpp
index ade142c5cb60..5364a6e01749 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -2221,8 +2221,8 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
         auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
         auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
         if (auto_params.supports_thinking) {
-            auto_params.thinking_start_tag = autoparser.reasoning.start;
-            auto_params.thinking_end_tag   = autoparser.reasoning.end;
+            auto_params.thinking_start_tag = trim_whitespace(autoparser.reasoning.start);
+            auto_params.thinking_end_tag   = trim_whitespace(autoparser.reasoning.end);
         }
         auto_params.generation_prompt = params.generation_prompt;
         common_peg_arena arena;
diff --git a/common/reasoning-budget.cpp b/common/reasoning-budget.cpp
index c6e1f86c91e9..8c1f72fc2a12 100644
--- a/common/reasoning-budget.cpp
+++ b/common/reasoning-budget.cpp
@@ -158,6 +158,8 @@ static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_tok
     for (size_t i = 0; i < cur_p->size; i++) {
         if (cur_p->data[i].id != forced) {
             cur_p->data[i].logit = -INFINITY;
+        } else {
+            cur_p->data[i].logit = +INFINITY; // force the token
         }
     }
 }
diff --git a/scripts/server-test-function-call.py b/scripts/server-test-function-call.py
index b3aae1a961e5..c32f17b5e3bf 100755
--- a/scripts/server-test-function-call.py
+++ b/scripts/server-test-function-call.py
@@ -79,7 +79,7 @@ def print_info(msg):
 # ---------------------------------------------------------------------------
 
 
-def chat_completion(url, messages, tools=None, stream=False):
+def chat_completion(url, messages, tools=None, stream=False, force_tools=False):
     payload = {
         "messages": messages,
         "stream": stream,
@@ -87,7 +87,10 @@ def chat_completion(url, messages, tools=None, stream=False):
     }
     if tools:
         payload["tools"] = tools
-        payload["tool_choice"] = "auto"
+        if force_tools:
+            payload["tool_choice"] = "required"
+        else:
+            payload["tool_choice"] = "auto"
 
     try:
         response = requests.post(url, json=payload, stream=stream)
@@ -160,7 +163,13 @@ def chat_completion(url, messages, tools=None, stream=False):
     return result
 
 
-def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6):
+def all_tools_called(tools, all_tool_calls):
+    all_tool_names = set([tc["function"]["name"] for tc in tools])
+    all_called_tool_names = set([tc["function"]["name"] for tc in all_tool_calls])
+    return all_tool_names == all_called_tool_names
+
+
+def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6, force_tools=False):
     """
     Drive the multi-turn tool-call loop:
       1. Send messages to model.
@@ -172,8 +181,8 @@ def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turn
     msgs = list(messages)
     all_tool_calls: list[dict] = []
 
-    for _ in range(max_turns):
-        result = chat_completion(url, msgs, tools=tools, stream=stream)
+    for t in range(max_turns):
+        result = chat_completion(url, msgs, tools=tools, stream=stream, force_tools=(force_tools and not all_tools_called(tools, all_tool_calls)))
         if result is None:
             return all_tool_calls, None
 
@@ -235,10 +244,10 @@ def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turn
 # ---------------------------------------------------------------------------
 
 
-def run_test(url, test_case, stream):
+def run_test(url, test_case, stream, force_tools):
     name = test_case["name"]
     mode = f"{'stream' if stream else 'non-stream'}"
-    print_header(f"{name}  [{mode}]")
+    print_header(f"{name} [{mode}, force_tools={force_tools}] ")
 
     all_tool_calls, final_content = run_agentic_loop(
         url,
@@ -246,6 +255,7 @@ def run_test(url, test_case, stream):
         tools=test_case["tools"],
         mock_tool_responses=test_case["mock_tool_responses"],
         stream=stream,
+        force_tools=force_tools
     )
 
     if final_content is None and not all_tool_calls:
@@ -1093,6 +1103,9 @@ def main():
     parser.add_argument(
         "--stream-only", action="store_true", help="Only run streaming mode tests"
     )
+    parser.add_argument(
+        "--force-tools", action="store_true", help="Change tool mode to forced instead of auto"
+    )
     parser.add_argument(
         "--test",
         help="Run only the test whose name contains this substring (case-insensitive)",
@@ -1103,10 +1116,13 @@ def main():
     print_info(f"Testing server at {url}")
 
     modes = []
+    force_tools = False
     if not args.stream_only:
         modes.append(False)
     if not args.no_stream:
         modes.append(True)
+    if args.force_tools:
+        force_tools = True
 
     cases: list[dict] = ALL_TEST_CASES
     if args.test:
@@ -1121,7 +1137,7 @@ def main():
     for stream in modes:
         for case in cases:
             total += 1
-            if run_test(url, case, stream=stream):
+            if run_test(url, case, stream=stream, force_tools=force_tools):
                 passed += 1
 
     color = GREEN if passed == total else RED
diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp
index e6a5236645e1..ea9d87ebed28 100644
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -542,6 +542,36 @@ static common_chat_tool edit_tool{
     })",
 };
 
+static common_chat_tool manage_todo_list_tool{
+    /* .name = */ "manage_todo_list",
+    /* .description = */ "Create or update the todo list",
+    /* .parameters = */ R"({
+        "type": "object",
+        "properties": {
+            "todos": {
+                "type": "array",
+                "description": "List of TODO list items"
+            }
+        },
+        "required": ["todos"]
+    })",
+};
+
+static common_chat_tool run_in_terminal_tool{
+    /* .name = */ "run_in_terminal",
+    /* .description = */ "Run a shell command.",
+    /* .parameters = */ R"({
+        "type": "object",
+        "properties": {
+            "command": {
+                "type": "string",
+                "description": "Shell command to run"
+            }
+        },
+        "required": ["command"]
+    })",
+};
+
 static common_chat_tool magic_tool{
     /* .name = */ "magic",
     /* .description = */ "Magic tool that takes a hash",
@@ -1379,6 +1409,16 @@ class peg_test_builder {
         return *this;
     }
 
+    peg_test_builder & tool_choice(common_chat_tool_choice choice) {
+        tc_.params.tool_choice = choice;
+        return *this;
+    }
+
+    peg_test_builder & messages(std::vector<common_chat_msg> messages) {
+        tc_.params.messages = std::move(messages);
+        return *this;
+    }
+
     // Execute the test
     void run() {
         // Check template filter
@@ -1755,23 +1795,23 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
                "hello()\n"
                "</parameter>\n"
                "</function>\n"
-               "</tool_call>"
-            )
+               "</tool_call>")
             .enable_thinking(true)
             .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
             .tools({
                 python_tool
         })
-            .expect_reasoning("Let's call a tool: <tool_call>\n"
-               "<function=python>\n"
-               "<parameter=code>\n"
-               "def hello():\n"
-               "    print(\"Not the real call!\")\n"
-               "\n"
-               "hello()\n"
-               "</parameter>\n"
-               "</function>\n"
-               "</tool_call>")
+            .expect_reasoning(
+                "Let's call a tool: <tool_call>\n"
+                "<function=python>\n"
+                "<parameter=code>\n"
+                "def hello():\n"
+                "    print(\"Not the real call!\")\n"
+                "\n"
+                "hello()\n"
+                "</parameter>\n"
+                "</function>\n"
+                "</tool_call>")
             .expect_tool_calls({
                 { "python", "{\"code\": \"def hello():\\n    print(\\\"Hello, world!\\\")\\n\\nhello()\"}", {} },
             })
@@ -1800,6 +1840,219 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
             .tools({ empty_args_tool_no_properties })
             .expect(message_with_tool_calls("empty_args_no_props", "{}"))
             .run();
+
+        // Edge cases when reasoning traces are not sent
+        tst.test(
+               "<think>\n\n</think>\n\n"
+               "<tool_call>\n"
+               "<function=special_function>\n"
+               "<parameter=arg1>\n1\n</parameter>\n"
+               "</function>\n"
+               "</tool_call>")
+            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+            .tools({
+                special_function_tool
+        })
+            .expect_reasoning("<think>\n\n")
+            .expect_tool_calls({ { "special_function", "{\"arg1\": 1}", "" } })
+            .run();
+
+        tst.test(
+               "</think>\n\n"
+               "<tool_call>\n"
+               "<function=special_function>\n"
+               "<parameter=arg1>\n1\n</parameter>\n"
+               "</function>\n"
+               "</tool_call>")
+            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+            .tools({
+                special_function_tool
+        })
+            .expect_reasoning("")
+            .expect_tool_calls({ { "special_function", "{\"arg1\": 1}", "" } })
+            .run();
+
+        tst.test(
+               "</think>\n\n"
+               "<tool_call>\n"
+               "<function=run_in_terminal>\n"
+               "<parameter=command>\n"
+               "pwd\n"
+               "</parameter>\n"
+               "</function>\n"
+               "</tool_call>")
+            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+            .enable_thinking(true)
+            .tools({
+                run_in_terminal_tool
+        })
+            .expect_tool_calls({
+                { "run_in_terminal", R"({"command": "pwd"})", {} },
+            })
+            .run();
+
+        tst.test(
+               "</think>\n\n"
+               "Let me inspect the current directory.\n"
+               "<tool_call>\n"
+               "<function=run_in_terminal>\n"
+               "<parameter=command>\n"
+               "pwd\n"
+               "</parameter>\n"
+               "</function>\n"
+               "</tool_call>")
+            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+            .enable_thinking(true)
+            .tools({
+                run_in_terminal_tool
+        })
+            .expect_content("Let me inspect the current directory.\n")
+            .expect_tool_calls({
+                { "run_in_terminal", R"({"command": "pwd"})", {} },
+            })
+            .run();
+
+        tst.test(
+               "</think>\n\n"
+               "Let me inspect the current directory.\n"
+               "<tool_call>\n"
+               "<function=run_in_terminal>\n"
+               "<parameter=command>\n"
+               "pwd\n"
+               "</parameter>\n"
+               "</function>\n"
+               "</tool_call>")
+            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+            .enable_thinking(true)
+            .tools({
+                run_in_terminal_tool
+        })
+            .tool_choice(COMMON_CHAT_TOOL_CHOICE_REQUIRED)
+            .expect_content("Let me inspect the current directory.\n")
+            .expect_tool_calls({
+                { "run_in_terminal", R"({"command": "pwd"})", {} },
+            })
+            .run();
+
+        tst.test(
+               "I should inspect the directory.\n"
+               "</think>\n\n"
+               "Let me inspect it now.\n"
+               "<tool_call>\n"
+               "<function=run_in_terminal>\n"
+               "<parameter=command>\n"
+               "pwd\n"
+               "</parameter>\n"
+               "</function>\n"
+               "</tool_call>")
+            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+            .enable_thinking(true)
+            .tools({
+                run_in_terminal_tool
+        })
+            .expect_reasoning("I should inspect the directory.")
+            .expect_content("Let me inspect it now.\n")
+            .expect_tool_calls({
+                { "run_in_terminal", R"({"command": "pwd"})", {} },
+            })
+            .run();
+
+        tst.test(
+               "I might call <tool_call> later, but I am still thinking.\n"
+               "</think>\n\n"
+               "Final answer without tools.")
+            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+            .enable_thinking(true)
+            .tools({ run_in_terminal_tool })
+            .expect_reasoning("I might call <tool_call> later, but I am still thinking.")
+            .expect_content("Final answer without tools.")
+            .run();
+
+        {
+            common_chat_msg user_start;
+            user_start.role    = "user";
+            user_start.content = "Create a todo list, then inspect the repository.";
+
+            common_chat_msg assistant_todos =
+                simple_assist_msg("", "", "manage_todo_list",
+                                  R"({"todos":[{"item":"Inspect repository","selected":false}]})", "call_todos");
+
+            common_chat_msg tool_result;
+            tool_result.role         = "tool";
+            tool_result.content      = "Successfully wrote todo list";
+            tool_result.tool_call_id = "call_todos";
+
+            common_chat_msg user_continue;
+            user_continue.role    = "user";
+            user_continue.content = "Proceed.";
+
+            tst.test(
+                   "I need to run a terminal command.\n"
+                   "</think>\n\n"
+                   "<tool_call>\n"
+                   "<function=run_in_terminal>\n"
+                   "<parameter=command>\n"
+                   "pwd\n"
+                   "</parameter>\n"
+                   "</function>\n"
+                   "</tool_call>")
+                .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+                .enable_thinking(true)
+                .tools({
+                    manage_todo_list_tool, run_in_terminal_tool
+            })
+                .messages({ user_start, assistant_todos, tool_result, user_continue })
+                .expect_reasoning("I need to run a terminal command.")
+                .expect_tool_calls({
+                    { "run_in_terminal", R"({"command": "pwd"})", {} },
+                })
+                .run();
+
+            tst.test(
+                   "I need to run a terminal command.\n"
+                   "</think>\n\n"
+                   "Let me inspect the current directory.\n"
+                   "<tool_call>\n"
+                   "<function=run_in_terminal>\n"
+                   "<parameter=command>\n"
+                   "pwd\n"
+                   "</parameter>\n"
+                   "</function>\n"
+                   "</tool_call>")
+                .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+                .enable_thinking(true)
+                .tools({
+                    manage_todo_list_tool, run_in_terminal_tool
+            })
+                .tool_choice(COMMON_CHAT_TOOL_CHOICE_REQUIRED)
+                .messages({ user_start, assistant_todos, tool_result, user_continue })
+                .expect_reasoning("I need to run a terminal command.")
+                .expect_content("Let me inspect the current directory.\n")
+                .expect_tool_calls({
+                    { "run_in_terminal", R"({"command": "pwd"})", {} },
+                })
+                .run();
+
+            tst.test(
+                   "</think>\n\n"
+                   "<tool_call>\n"
+                   "<function=run_in_terminal>\n"
+                   "<parameter=command>\n"
+                   "pwd\n"
+                   "</parameter>\n"
+                   "</function>\n"
+                   "</tool_call>")
+                .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+                .enable_thinking(true)
+                .tools({
+                    manage_todo_list_tool, run_in_terminal_tool
+            })
+                .messages({ user_start, assistant_todos, tool_result, user_continue })
+                .expect_tool_calls({
+                    { "run_in_terminal", R"({"command": "pwd"})", {} },
+                })
+                .run();
+        }
     }
 
     {
diff --git a/tests/test-reasoning-budget.cpp b/tests/test-reasoning-budget.cpp
index f7a601789966..747d246448f8 100644
--- a/tests/test-reasoning-budget.cpp
+++ b/tests/test-reasoning-budget.cpp
@@ -70,20 +70,20 @@ static void test_reasoning_budget(
         llama_sampler_apply(sampler, &cur_p);
 
         // Check if forcing is active (all logits except one should be -INFINITY)
-        size_t finite_count = 0;
-        llama_token finite_token = -1;
+        size_t not_neg_inf = 0;
+        llama_token not_neg_inf_token = -1;
         for (size_t j = 0; j < cur.size(); j++) {
-            if (std::isfinite(cur[j].logit)) {
-                finite_count++;
-                finite_token = cur[j].id;
+            if (std::isfinite(cur[j].logit) || cur[j].logit > 0) { // +INFINITY
+                not_neg_inf++;
+                not_neg_inf_token = cur[j].id;
             }
         }
 
         llama_sampler_accept(sampler, sequence[i]);
 
-        fprintf(stderr, "    i=%zu: token=%d, finite_count=%zu, finite_token=%d\n", i, (int)sequence[i], finite_count, (int)finite_token);
+        fprintf(stderr, "    i=%zu: token=%d, not_neg_inf_count=%zu, not_neg_inf_token=%d\n", i, (int)sequence[i], not_neg_inf, (int)not_neg_inf_token);
 
-        if (finite_count == 1) {
+        if (not_neg_inf == 1) {
             if (actual_force_start == SIZE_MAX) {
                 actual_force_start = i;
             }
diff --git a/tools/server/tests/unit/test_tool_call.py b/tools/server/tests/unit/test_tool_call.py
index b1a5ab9da4e0..9fa84d165efc 100755
--- a/tools/server/tests/unit/test_tool_call.py
+++ b/tools/server/tests/unit/test_tool_call.py
@@ -126,69 +126,70 @@ def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict
             actual_arguments = json.loads(actual_arguments)
         assert argument_key in actual_arguments, f"tool arguments: {actual_arguments}, expected: {argument_key}"
 
-
-@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
-@pytest.mark.parametrize("template_name,tool,argument_key", [
-    ("Qwen3-Coder",                                   TEST_TOOL,            "success"),
-    ("Qwen3-Coder",                                   TEST_TOOL,            "success"),
-    ("meta-llama-Llama-3.3-70B-Instruct",             TEST_TOOL,            "success"),
-    ("meta-llama-Llama-3.3-70B-Instruct",             TEST_TOOL,            "success"),
-    ("meta-llama-Llama-3.3-70B-Instruct",             PYTHON_TOOL,          "code"),
-    ("meta-llama-Llama-3.3-70B-Instruct",             PYTHON_TOOL,          "code"),
-])
-def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
-    global server
-    n_predict = 1024
-    # server = ServerPreset.stories15m_moe()
-    server.jinja = True
-    server.n_predict = n_predict
-    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
-    server.start()
-    do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED, temperature=0.0, top_k=1, top_p=1.0)
-
-
-@pytest.mark.slow
-@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
-@pytest.mark.parametrize("template_name,tool,argument_key", [
-    ("meta-llama-Llama-3.1-8B-Instruct",              TEST_TOOL,            "success"),
-    ("meta-llama-Llama-3.1-8B-Instruct",              PYTHON_TOOL,          "code"),
-
-    ("meetkai-functionary-medium-v3.1",               TEST_TOOL,            "success"),
-    ("meetkai-functionary-medium-v3.1",               PYTHON_TOOL,          "code"),
-
-    ("meetkai-functionary-medium-v3.2",               TEST_TOOL,            "success"),
-    # Functionary v3.2 format supports raw python content, which w/ a dummy stories model will never end on its own.
-    # ("meetkai-functionary-medium-v3.2",               PYTHON_TOOL,          "code"),
-
-    ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL,            "success"),
-    ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL,          "code"),
-
-    ("meta-llama-Llama-3.2-3B-Instruct",              TEST_TOOL,            "success"),
-    ("meta-llama-Llama-3.2-3B-Instruct",              PYTHON_TOOL,          "code"),
-
-    ("mistralai-Mistral-Nemo-Instruct-2407",          TEST_TOOL,            "success"),
-    ("mistralai-Mistral-Nemo-Instruct-2407",          PYTHON_TOOL,          "code"),
-
-    ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use",   TEST_TOOL,            "success"),
-    ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use",   PYTHON_TOOL,          "code"),
-
-    ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B",      TEST_TOOL,            "success"),
-    ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B",      PYTHON_TOOL,          "code"),
-
-    ("fireworks-ai-llama-3-firefunction-v2",          TEST_TOOL,            "success"),
-    # ("fireworks-ai-llama-3-firefunction-v2",          PYTHON_TOOL,          "codeFalse), True),
-    # ("fireworks-ai-llama-3-firefunction-v2",          PYTHON_TOOL,          "code"),
-
-])
-def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
-    global server
-    n_predict = 512
-    # server = ServerPreset.stories15m_moe()
-    server.jinja = True
-    server.n_predict = n_predict
-    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
-    server.start(timeout_seconds=TIMEOUT_START_SLOW)
-    do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED)
+# PR #22654: commented out since we're now allowing content before tool calls in tool_call: required, so we can't force this
+# in the tiny model just by using the grammar
+#
+# @pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
+# @pytest.mark.parametrize("template_name,tool,argument_key", [
+#     ("Qwen3-Coder",                                   TEST_TOOL,            "success"),
+#     ("Qwen3-Coder",                                   TEST_TOOL,            "success"),
+#     ("meta-llama-Llama-3.3-70B-Instruct",             TEST_TOOL,            "success"),
+#     ("meta-llama-Llama-3.3-70B-Instruct",             TEST_TOOL,            "success"),
+#     ("meta-llama-Llama-3.3-70B-Instruct",             PYTHON_TOOL,          "code"),
+#     ("meta-llama-Llama-3.3-70B-Instruct",             PYTHON_TOOL,          "code"),
+# ])
+# def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
+#     global server
+#     n_predict = 1024
+#     # server = ServerPreset.stories15m_moe()
+#     server.jinja = True
+#     server.n_predict = n_predict
+#     server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
+#     server.start()
+#     do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED, temperature=0.0, top_k=1, top_p=1.0)
+
+# @pytest.mark.slow
+# @pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
+# @pytest.mark.parametrize("template_name,tool,argument_key", [
+#     ("meta-llama-Llama-3.1-8B-Instruct",              TEST_TOOL,            "success"),
+#     ("meta-llama-Llama-3.1-8B-Instruct",              PYTHON_TOOL,          "code"),
+
+#     ("meetkai-functionary-medium-v3.1",               TEST_TOOL,            "success"),
+#     ("meetkai-functionary-medium-v3.1",               PYTHON_TOOL,          "code"),
+
+#     ("meetkai-functionary-medium-v3.2",               TEST_TOOL,            "success"),
+#     # Functionary v3.2 format supports raw python content, which w/ a dummy stories model will never end on its own.
+#     # ("meetkai-functionary-medium-v3.2",               PYTHON_TOOL,          "code"),
+
+#     ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL,            "success"),
+#     ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL,          "code"),
+
+#     ("meta-llama-Llama-3.2-3B-Instruct",              TEST_TOOL,            "success"),
+#     ("meta-llama-Llama-3.2-3B-Instruct",              PYTHON_TOOL,          "code"),
+
+#     ("mistralai-Mistral-Nemo-Instruct-2407",          TEST_TOOL,            "success"),
+#     ("mistralai-Mistral-Nemo-Instruct-2407",          PYTHON_TOOL,          "code"),
+
+#     ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use",   TEST_TOOL,            "success"),
+#     ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use",   PYTHON_TOOL,          "code"),
+
+#     ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B",      TEST_TOOL,            "success"),
+#     ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B",      PYTHON_TOOL,          "code"),
+
+#     ("fireworks-ai-llama-3-firefunction-v2",          TEST_TOOL,            "success"),
+#     # ("fireworks-ai-llama-3-firefunction-v2",          PYTHON_TOOL,          "codeFalse), True),
+#     # ("fireworks-ai-llama-3-firefunction-v2",          PYTHON_TOOL,          "code"),
+
+# ])
+# def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
+#     global server
+#     n_predict = 512
+#     # server = ServerPreset.stories15m_moe()
+#     server.jinja = True
+#     server.n_predict = n_predict
+#     server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
+#     server.start(timeout_seconds=TIMEOUT_START_SLOW)
+#     do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED)
 
 
 @pytest.mark.slow

From 36a694c96552a0c5299f297e39de60745db5a073 Mon Sep 17 00:00:00 2001
From: JusteLeo <leonard.adamo66@gmail.com>
Date: Mon, 4 May 2026 13:38:10 +0200
Subject: [PATCH 09/13] webui : fix circular dependency between chat.service.ts
 and models.svelte.ts (#22625)

---
 tools/server/webui/src/lib/stores/models.svelte.ts | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/server/webui/src/lib/stores/models.svelte.ts b/tools/server/webui/src/lib/stores/models.svelte.ts
index d7c885844fc8..00748c4c2a39 100644
--- a/tools/server/webui/src/lib/stores/models.svelte.ts
+++ b/tools/server/webui/src/lib/stores/models.svelte.ts
@@ -1,7 +1,8 @@
 import { SvelteMap, SvelteSet } from 'svelte/reactivity';
 import { toast } from 'svelte-sonner';
 import { ServerModelStatus, ModelModality } from '$lib/enums';
-import { ModelsService, PropsService } from '$lib/services';
+import { ModelsService } from '$lib/services/models.service';
+import { PropsService } from '$lib/services/props.service';
 import { serverStore } from '$lib/stores/server.svelte';
 import { TTLCache } from '$lib/utils';
 import {

From d8794eecd582b35ab5540e748dba057fc48ebe8b Mon Sep 17 00:00:00 2001
From: Shakhnazar Sailaukan <101112128+Sailaukan@users.noreply.github.com>
Date: Mon, 4 May 2026 16:19:30 +0400
Subject: [PATCH 10/13] examples: refactor diffusion generation (#22590)

* examples: refactor diffusion generation

* renamed enum values
---
 common/arg.cpp                       |   5 +-
 examples/diffusion/CMakeLists.txt    |   7 +-
 examples/diffusion/README.md         |  10 +-
 examples/diffusion/diffusion-cli.cpp | 462 +--------------------------
 examples/diffusion/diffusion.cpp     | 408 +++++++++++++++++++++++
 examples/diffusion/diffusion.h       |  57 ++++
 6 files changed, 496 insertions(+), 453 deletions(-)
 create mode 100644 examples/diffusion/diffusion.cpp
 create mode 100644 examples/diffusion/diffusion.h

diff --git a/common/arg.cpp b/common/arg.cpp
index 8f54ee38c1b8..bd1a745e6af5 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3794,7 +3794,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         {"--diffusion-algorithm"}, "N",
-        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm),
+        string_format(
+            "diffusion algorithm: 0=DIFFUSION_ALGORITHM_ORIGIN, 1=DIFFUSION_ALGORITHM_ENTROPY_BASED, "
+            "2=DIFFUSION_ALGORITHM_MARGIN_BASED, 3=DIFFUSION_ALGORITHM_RANDOM, "
+            "4=DIFFUSION_ALGORITHM_CONFIDENCE_BASED (default: %d)", params.diffusion.algorithm),
         [](common_params & params, int value) { params.diffusion.algorithm = value; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
diff --git a/examples/diffusion/CMakeLists.txt b/examples/diffusion/CMakeLists.txt
index 70228d4079b3..42a84b2dfe5f 100644
--- a/examples/diffusion/CMakeLists.txt
+++ b/examples/diffusion/CMakeLists.txt
@@ -1,5 +1,10 @@
+set(TARGET llama-diffusion)
+add_library(${TARGET} STATIC diffusion.cpp diffusion.h)
+target_link_libraries(${TARGET} PUBLIC llama llama-common ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PUBLIC cxx_std_17)
+
 set(TARGET llama-diffusion-cli)
 add_executable(${TARGET} diffusion-cli.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama-diffusion llama llama-common ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/diffusion/README.md b/examples/diffusion/README.md
index b3942002147a..6d2fffd64e34 100644
--- a/examples/diffusion/README.md
+++ b/examples/diffusion/README.md
@@ -12,11 +12,11 @@ The diffusion CLI supports various parameters to control the generation process:
 ### Core Diffusion Parameters
 - `--diffusion-steps`: Number of diffusion steps (default: 256)
 - `--diffusion-algorithm`: Algorithm for token selection
-  - `0`: ORIGIN - Token will be generated in a purely random order from https://arxiv.org/abs/2107.03006.
-  - `1`: ENTROPY_BASED - Entropy-based selection
-  - `2`: MARGIN_BASED - Margin-based selection
-  - `3`: RANDOM - Random selection
-  - `4`: CONFIDENCE_BASED - Confidence-based selection (default)
+  - `0`: DIFFUSION_ALGORITHM_ORIGIN - Token will be generated in a purely random order from https://arxiv.org/abs/2107.03006.
+  - `1`: DIFFUSION_ALGORITHM_ENTROPY_BASED - Entropy-based selection
+  - `2`: DIFFUSION_ALGORITHM_MARGIN_BASED - Margin-based selection
+  - `3`: DIFFUSION_ALGORITHM_RANDOM - Random selection
+  - `4`: DIFFUSION_ALGORITHM_CONFIDENCE_BASED - Confidence-based selection (default)
   - More documentation here https://github.com/DreamLM/Dream
 - `--diffusion-visual`: Enable live visualization during generation
 
diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp
index 403b9b474454..86ebbf88c98d 100644
--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@@ -1,127 +1,23 @@
 #include "arg.h"
 #include "chat.h"
 #include "common.h"
+#include "diffusion.h"
 #include "llama.h"
 #include "log.h"
 
 #include <limits.h>
 
-#include <algorithm>
 #include <clocale>
-#include <cmath>
 #include <cstring>
-#include <limits>
-#include <random>
 #include <string>
 #include <vector>
 
-enum diffusion_algorithm { ORIGIN = 0, ENTROPY_BASED = 1, MARGIN_BASED = 2, RANDOM = 3, CONFIDENCE_BASED = 4 };
-
-// Unified transfer scheduling methods
-enum transfer_schedule {
-    TIMESTEP_BASED = 0,  // Dream-style: (1.0 - s/t) * remaining
-    BLOCK_BASED    = 1,  // LLaDA-style: process in blocks with get_num_transfer_tokens
-};
-
-typedef bool (*diffusion_step_callback_t)(int32_t             step,
-                                          int32_t             total_steps,
-                                          const llama_token * tokens,
-                                          int32_t             n_tokens,
-                                          void *              user_data);
-
-struct diffusion_params {
-    int32_t                   steps                   = 0;
-    float                     temperature             = 0;
-    llama_token               mask_token_id           = LLAMA_TOKEN_NULL;
-    diffusion_step_callback_t step_callback           = nullptr;
-    void *                    step_callback_user_data = nullptr;
-    int32_t                   seed                    = 0;
-    bool                      visual_mode             = false;
-    bool                      shift_logits            = false;  // Shift logits by -1 after decode
-
-    float   top_p = 0.;
-    int32_t top_k = 0.;
-
-    diffusion_algorithm algorithm = CONFIDENCE_BASED;
-    transfer_schedule   schedule  = TIMESTEP_BASED;
-
-    float   cfg_scale        = 0.;     // Config scale for classifier-free guidance
-    float   eps              = 0.;     // Timestep scheduling
-    int32_t block_length     = 0;      // Block size (for block scheduling)
-    float   alg_temp         = 0;      // algorithm temperature (0.0 = deterministic)
-    bool    add_gumbel_noise = false;  // Add gumbel noise to the logits if temp > 0.0
-
-    int32_t max_length = 0;            // Maximum sequence length
-};
-
 struct callback_data {
     diffusion_params *  diff_params;
     const llama_vocab * vocab;
     int32_t             n_input;
 };
 
-static float calculate_confidence(const llama_token_data_array & cur_p,
-                                  diffusion_algorithm            algorithm,
-                                  std::mt19937 &                 rng) {
-    switch (algorithm) {
-        case CONFIDENCE_BASED:
-            return cur_p.data[cur_p.selected].p;  // Selected token probability
-
-        case ENTROPY_BASED:
-            {
-                float       entropy = 0.0f;
-                const float epsilon = 1e-10f;
-                for (size_t i = 0; i < cur_p.size; i++) {
-                    float prob = cur_p.data[i].p;
-                    entropy += prob * logf(prob + epsilon);
-                }
-                return -entropy;  // Higher entropy = lower confidence
-            }
-
-        case MARGIN_BASED:
-            return (cur_p.size > 1) ? cur_p.data[0].p - cur_p.data[1].p : cur_p.data[0].p;
-
-        case RANDOM:
-            {
-                std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
-                return uniform(rng);  // Random confidence
-            }
-
-        case ORIGIN:
-            return cur_p.data[cur_p.selected].p;
-
-        default:
-            return 0.0f;
-    }
-}
-
-// Unified transfer count calculation function
-static int32_t calculate_transfer_count(int32_t                      step,
-                                        int32_t                      total_steps,
-                                        int32_t                      remaining_masked,
-                                        transfer_schedule            schedule,
-                                        float                        eps,
-                                        const std::vector<int32_t> & num_transfer_tokens = {}) {
-    switch (schedule) {
-        case TIMESTEP_BASED:
-            {
-                float t          = 1.0f - (float) step / total_steps * (1.0f - eps);
-                float s          = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps);
-                float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f;
-                return (int32_t) (remaining_masked * p_transfer);
-            }
-
-        case BLOCK_BASED:
-            if (!num_transfer_tokens.empty() && step < (int32_t) num_transfer_tokens.size()) {
-                return num_transfer_tokens[step];
-            }
-            return remaining_masked / (total_steps - step);  // Fallback
-
-        default:
-            return remaining_masked / (total_steps - step);
-    }
-}
-
 static bool diffusion_step_callback(int32_t             step,
                                     int32_t             total_steps,
                                     const llama_token * tokens,
@@ -176,341 +72,6 @@ static bool diffusion_step_callback(int32_t             step,
     return true;
 }
 
-static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) {
-    if (temperature == 0.0f) {
-        return;
-    }
-
-    std::uniform_real_distribution<double> uniform(0.0, 1.0);
-    for (int32_t i = 0; i < n_vocab; i++) {
-        double noise        = uniform(rng);
-        // Prevent log(0)
-        noise               = std::max(noise, 1e-20);
-        double gumbel_noise = std::pow(-std::log(noise), temperature);
-        logits[i]           = std::exp(logits[i]) / gumbel_noise;
-    }
-}
-
-static std::vector<int32_t> get_num_transfer_tokens(int32_t mask_count, int32_t steps) {
-    std::vector<int32_t> num_transfer_tokens(steps);
-
-    int32_t base      = mask_count / steps;
-    int32_t remainder = mask_count % steps;
-
-    for (int32_t i = 0; i < steps; i++) {
-        num_transfer_tokens[i] = base + (i < remainder ? 1 : 0);
-    }
-
-    return num_transfer_tokens;
-}
-
-static void diffusion_generate(llama_context *          ctx,
-                               const llama_token *      input_tokens,
-                               llama_token *            output_tokens,
-                               int32_t                  n_input,
-                               const diffusion_params & params,
-                               int32_t &                n_generated) {
-    n_generated = 0;
-    if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) {
-        return;
-    }
-
-    const llama_model * model = llama_get_model(ctx);
-
-    // Initialize with input and pad with mask tokens
-    std::copy(input_tokens, input_tokens + n_input, output_tokens);
-    std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id);
-
-    std::mt19937 rng(params.seed);
-
-    llama_set_causal_attn(ctx, false);
-
-    int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
-
-    std::vector<llama_token_data> candidates(n_vocab);
-    std::vector<llama_token_data> conf_candidates;
-    conf_candidates.reserve(params.max_length);
-    std::vector<int32_t> mask_positions;
-    mask_positions.reserve(params.max_length);
-
-    // Setup sampler chain
-    struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
-    if (params.top_k > 0) {
-        llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
-    }
-    if (params.top_p < 1.0f) {
-        llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
-    }
-    if (params.temperature > 0.0f) {
-        llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
-    }
-    llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));
-
-    struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);
-
-    llama_batch batch = llama_batch_init(params.max_length, 0, 1);
-    batch.n_tokens    = params.max_length;
-
-    // Pre-allocate buffers for CFG if needed
-    int32_t                  logits_size = n_vocab * params.max_length;
-    std::vector<float>       cond_logits_buffer;
-    std::vector<llama_token> un_x_buffer;
-    if (params.cfg_scale > 0.0f) {
-        cond_logits_buffer.resize(logits_size);
-        un_x_buffer.resize(params.max_length);
-    }
-
-    // For block-based processing
-    std::vector<int32_t> num_transfer_tokens;
-    int32_t              num_blocks      = 1;
-    int32_t              steps_per_block = params.steps;
-
-    if (params.schedule == BLOCK_BASED) {
-        GGML_ASSERT(params.max_length % params.block_length == 0);
-        num_blocks = params.max_length / params.block_length;
-        GGML_ASSERT(params.steps % num_blocks == 0);
-        steps_per_block = params.steps / num_blocks;
-    }
-
-    std::vector<float> confidence(params.max_length);
-
-    int64_t total_sampling_time = 0;
-    int64_t total_time          = 0;
-    int64_t time_start          = ggml_time_us();
-
-    for (int block_num = 0; block_num < num_blocks; block_num++) {
-        int32_t block_start = (params.schedule == BLOCK_BASED) ? n_input + block_num * params.block_length : 0;
-        int32_t block_end   = (params.schedule == BLOCK_BASED) ?
-                                  std::min(n_input + (block_num + 1) * params.block_length, params.max_length) :
-                                  params.max_length;
-
-        // Count masked tokens in current block for block-based processing
-        if (params.schedule == BLOCK_BASED) {
-            int32_t block_mask_count = 0;
-            for (int i = block_start; i < block_end; i++) {
-                if (output_tokens[i] == params.mask_token_id) {
-                    block_mask_count++;
-                }
-            }
-            num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block);
-        }
-
-        for (int32_t step = 0; step < steps_per_block; step++) {
-            int32_t global_step = block_num * steps_per_block + step;
-
-            if (params.step_callback) {
-                if (!params.step_callback(
-                        global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) {
-                    break;
-                }
-            }
-
-            // Setup batch
-            for (int32_t i = 0; i < params.max_length; i++) {
-                batch.token[i]     = output_tokens[i];
-                batch.pos[i]       = i;
-                batch.n_seq_id[i]  = 1;
-                batch.seq_id[i][0] = 0;
-                batch.logits[i]    = 1;
-            }
-
-            float * logits = nullptr;
-
-            if (params.cfg_scale > 0.0f) {
-                int ret = llama_decode(ctx, batch);
-                if (ret != 0) {
-                    LOG_ERR("Failed to generate conditional");
-                    break;
-                }
-                float * cond_logits_ptr = llama_get_logits(ctx);
-                std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float));
-
-                // Unconditional generation (mask input)
-                std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin());
-                for (int32_t i = 0; i < n_input; i++) {
-                    un_x_buffer[i] = params.mask_token_id;
-                }
-
-                for (int32_t i = 0; i < params.max_length; i++) {
-                    batch.token[i] = un_x_buffer[i];
-                }
-                ret = llama_decode(ctx, batch);
-                if (ret != 0) {
-                    LOG_ERR("Failed to generate unconditional");
-                    break;
-                }
-                float * uncond_logits = llama_get_logits(ctx);
-
-                // Apply CFG
-                for (int32_t i = 0; i < logits_size; i++) {
-                    cond_logits_buffer[i] =
-                        uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]);
-                }
-                logits = cond_logits_buffer.data();
-            } else {
-                int ret = llama_decode(ctx, batch);
-                if (ret != 0) {
-                    LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret);
-                    break;
-                }
-                logits = llama_get_logits(ctx);
-            }
-
-            if (!logits) {
-                LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
-                break;
-            }
-
-            auto get_logits_for_pos = [&](int32_t pos) -> const float * {
-                if (params.shift_logits) {
-                    return pos == 0 ? logits : logits + (pos - 1) * n_vocab;
-                }
-                return logits + (pos) *n_vocab;
-            };
-
-            int64_t time_start_sampling = ggml_time_us();
-
-            mask_positions.clear();
-            for (int32_t i = 0; i < params.max_length; i++) {
-                if (output_tokens[i] == params.mask_token_id) {
-                    // For block-based, only consider current block
-                    if (params.schedule != BLOCK_BASED || (i >= block_start && i < block_end)) {
-                        mask_positions.push_back(i);
-                    }
-                }
-            }
-
-            if (mask_positions.empty()) {
-                break;
-            }
-
-            if (params.add_gumbel_noise && params.temperature > 0.0f) {
-                add_gumbel_noise(logits, n_vocab, params.temperature, rng);
-            }
-
-            if (params.algorithm == ORIGIN) {
-                int32_t transfer_count = calculate_transfer_count(
-                    step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
-                float p_transfer = (float) transfer_count / mask_positions.size();
-
-                for (int32_t pos : mask_positions) {
-                    if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
-                        const float * pos_logits = get_logits_for_pos(pos);
-                        for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
-                            candidates[token_id].id    = token_id;
-                            candidates[token_id].logit = pos_logits[token_id];
-                            candidates[token_id].p     = 0.0f;
-                        }
-
-                        llama_token_data_array cur_p = {
-                            candidates.data(),
-                            (size_t) n_vocab,
-                            -1,
-                            false,
-                        };
-
-                        llama_sampler_apply(sampler, &cur_p);
-                        output_tokens[pos] = cur_p.data[cur_p.selected].id;
-                    }
-                }
-            } else {
-                std::vector<std::pair<float, int32_t>> confidences;
-                std::vector<llama_token>               sampled_tokens(mask_positions.size());
-
-                for (size_t i = 0; i < mask_positions.size(); i++) {
-                    int32_t       pos        = mask_positions[i];
-                    const float * pos_logits = get_logits_for_pos(pos);
-
-                    for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
-                        candidates[token_id].logit = pos_logits[token_id];
-                        candidates[token_id].p     = 0.0f;
-                        candidates[token_id].id    = token_id;
-                    }
-
-                    llama_token_data_array cur_p = {
-                        candidates.data(),
-                        candidates.size(),
-                        -1,
-                        false,
-                    };
-
-                    llama_sampler_apply(sampler, &cur_p);
-                    llama_token sampled_token = cur_p.data[cur_p.selected].id;
-
-                    float conf = calculate_confidence(cur_p, params.algorithm, rng);
-
-                    sampled_tokens[i] = sampled_token;
-                    confidences.emplace_back(conf, i);
-                }
-
-                int32_t transfer_count = calculate_transfer_count(
-                    step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
-
-                if (transfer_count > 0) {
-                    if (params.alg_temp == 0.0f) {
-                        std::partial_sort(confidences.begin(),
-                                          confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()),
-                                          confidences.end(),
-                                          [](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
-                                              if (a.first != b.first) {
-                                                  return a.first > b.first;
-                                              }
-                                              return a.second < b.second;
-                                          });
-
-                        for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
-                            int32_t mask_idx   = confidences[i].second;
-                            int32_t pos        = mask_positions[mask_idx];
-                            output_tokens[pos] = sampled_tokens[mask_idx];
-                        }
-                    } else {
-                        conf_candidates.clear();
-                        for (size_t i = 0; i < confidences.size(); i++) {
-                            float conf_logit = confidences[i].first / params.alg_temp;
-                            conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f });
-                        }
-
-                        llama_token_data_array conf_array = {
-                            conf_candidates.data(),
-                            conf_candidates.size(),
-                            -1,
-                            false,
-                        };
-
-                        for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
-                            llama_sampler_apply(dist_sampler, &conf_array);
-                            int32_t selected_idx = conf_array.selected;
-                            int32_t mask_idx     = selected_idx;
-                            int32_t pos          = mask_positions[mask_idx];
-                            output_tokens[pos]   = sampled_tokens[mask_idx];
-
-                            conf_candidates[selected_idx].p = 0.0f;
-                            conf_array.selected             = -1;
-                        }
-                    }
-                }
-            }
-
-            int64_t time_end_sampling = ggml_time_us();
-            total_sampling_time += time_end_sampling - time_start_sampling;
-        }
-    }
-
-    int64_t time_end = ggml_time_us();
-    total_time += time_end - time_start;
-
-    LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
-            total_time / 1000.0,
-            total_time / 1000.0 / params.steps,
-            total_sampling_time / 1000.0 / params.steps);
-
-    llama_batch_free(batch);
-    llama_sampler_free(sampler);
-    llama_sampler_free(dist_sampler);
-
-    n_generated = params.max_length;
-}
-
 static std::string format_input_text(const std::string & prompt, const std::string & system_prompt, bool use_chat_template, llama_model * model) {
     if (!use_chat_template) {
         return prompt;
@@ -631,10 +192,10 @@ int main(int argc, char ** argv) {
     GGML_ASSERT((params.diffusion.eps == 0) ^ (params.diffusion.block_length == 0));
 
     if (params.diffusion.eps) {
-        diff_params.schedule = TIMESTEP_BASED;
+        diff_params.schedule = DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED;
         diff_params.eps      = params.diffusion.eps;
     } else if (params.diffusion.block_length) {
-        diff_params.schedule     = BLOCK_BASED;
+        diff_params.schedule     = DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED;
         diff_params.block_length = params.diffusion.block_length;
     }
 
@@ -653,8 +214,17 @@ int main(int argc, char ** argv) {
     callback_data cb_data               = { &diff_params, vocab, n_input };
     diff_params.step_callback_user_data = &cb_data;
 
-    const char * alg_names[]   = { "ORIGIN", "ENTROPY_BASED", "MARGIN_BASED", "RANDOM", "CONFIDENCE_BASED" };
-    const char * sched_names[] = { "TIMESTEP_BASED", "BLOCK_BASED" };
+    const char * alg_names[]   = {
+        "DIFFUSION_ALGORITHM_ORIGIN",
+        "DIFFUSION_ALGORITHM_ENTROPY_BASED",
+        "DIFFUSION_ALGORITHM_MARGIN_BASED",
+        "DIFFUSION_ALGORITHM_RANDOM",
+        "DIFFUSION_ALGORITHM_CONFIDENCE_BASED",
+    };
+    const char * sched_names[] = {
+        "DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED",
+        "DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED",
+    };
     const char * alg_name =
         (diff_params.algorithm >= 0 && diff_params.algorithm <= 4) ? alg_names[diff_params.algorithm] : "UNKNOWN";
     const char * sched_name =
@@ -666,11 +236,11 @@ int main(int argc, char ** argv) {
     LOG_INF("diffusion_params: - %-25s enum             = %d (%s)\n", "algorithm", diff_params.algorithm, alg_name);
     LOG_INF("diffusion_params: - %-25s enum             = %d (%s)\n", "schedule", diff_params.schedule, sched_name);
     LOG_INF("diffusion_params: - %-25s f32              = %.3f\n", "temperature", diff_params.temperature);
-    if (diff_params.schedule == TIMESTEP_BASED) {
+    if (diff_params.schedule == DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED) {
         LOG_INF("diffusion_params: - %-25s f32              = %.6f\n", "eps", diff_params.eps);
         LOG_INF("diffusion_params: - %-25s f32              = %.3f\n", "alg_temp", diff_params.alg_temp);
     }
-    if (diff_params.schedule == BLOCK_BASED) {
+    if (diff_params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) {
         LOG_INF("diffusion_params: - %-25s u32              = %d\n", "block_length", diff_params.block_length);
         LOG_INF("diffusion_params: - %-25s f32              = %.3f\n", "cfg_scale", diff_params.cfg_scale);
     }
diff --git a/examples/diffusion/diffusion.cpp b/examples/diffusion/diffusion.cpp
new file mode 100644
index 000000000000..97d6b69449e3
--- /dev/null
+++ b/examples/diffusion/diffusion.cpp
@@ -0,0 +1,408 @@
+#include "diffusion.h"
+
+#include "log.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cmath>
+#include <cstring>
+#include <random>
+#include <utility>
+#include <vector>
+
+static float calculate_confidence(const llama_token_data_array & cur_p,
+                                  diffusion_algorithm            algorithm,
+                                  std::mt19937 &                 rng) {
+    switch (algorithm) {
+        case DIFFUSION_ALGORITHM_CONFIDENCE_BASED:
+            return cur_p.data[cur_p.selected].p;  // Selected token probability
+
+        case DIFFUSION_ALGORITHM_ENTROPY_BASED:
+            {
+                float       entropy = 0.0f;
+                const float epsilon = 1e-10f;
+                for (size_t i = 0; i < cur_p.size; i++) {
+                    float prob = cur_p.data[i].p;
+                    entropy += prob * logf(prob + epsilon);
+                }
+                return -entropy;  // Higher entropy = lower confidence
+            }
+
+        case DIFFUSION_ALGORITHM_MARGIN_BASED:
+            return (cur_p.size > 1) ? cur_p.data[0].p - cur_p.data[1].p : cur_p.data[0].p;
+
+        case DIFFUSION_ALGORITHM_RANDOM:
+            {
+                std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
+                return uniform(rng);  // Random confidence
+            }
+
+        case DIFFUSION_ALGORITHM_ORIGIN:
+            return cur_p.data[cur_p.selected].p;
+
+        default:
+            return 0.0f;
+    }
+}
+
+// Unified transfer count calculation function
+static int32_t calculate_transfer_count(int32_t                      step,
+                                        int32_t                      total_steps,
+                                        int32_t                      remaining_masked,
+                                        diffusion_transfer_schedule  schedule,
+                                        float                        eps,
+                                        const std::vector<int32_t> & num_transfer_tokens = {}) {
+    switch (schedule) {
+        case DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED:
+            {
+                float t          = 1.0f - (float) step / total_steps * (1.0f - eps);
+                float s          = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps);
+                float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f;
+                return (int32_t) (remaining_masked * p_transfer);
+            }
+
+        case DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED:
+            if (!num_transfer_tokens.empty() && step < (int32_t) num_transfer_tokens.size()) {
+                return num_transfer_tokens[step];
+            }
+            return remaining_masked / (total_steps - step);  // Fallback
+
+        default:
+            return remaining_masked / (total_steps - step);
+    }
+}
+
+static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) {
+    if (temperature == 0.0f) {
+        return;
+    }
+
+    std::uniform_real_distribution<double> uniform(0.0, 1.0);
+    for (int32_t i = 0; i < n_vocab; i++) {
+        double noise        = uniform(rng);
+        // Prevent log(0)
+        noise               = std::max(noise, 1e-20);
+        double gumbel_noise = std::pow(-std::log(noise), temperature);
+        logits[i]           = std::exp(logits[i]) / gumbel_noise;
+    }
+}
+
+static std::vector<int32_t> get_num_transfer_tokens(int32_t mask_count, int32_t steps) {
+    std::vector<int32_t> num_transfer_tokens(steps);
+
+    int32_t base      = mask_count / steps;
+    int32_t remainder = mask_count % steps;
+
+    for (int32_t i = 0; i < steps; i++) {
+        num_transfer_tokens[i] = base + (i < remainder ? 1 : 0);
+    }
+
+    return num_transfer_tokens;
+}
+
+void diffusion_generate(llama_context *          ctx,
+                        const llama_token *      input_tokens,
+                        llama_token *            output_tokens,
+                        int32_t                  n_input,
+                        const diffusion_params & params,
+                        int32_t &                n_generated) {
+    n_generated = 0;
+    if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) {
+        return;
+    }
+
+    const llama_model * model = llama_get_model(ctx);
+
+    // Initialize with input and pad with mask tokens
+    std::copy(input_tokens, input_tokens + n_input, output_tokens);
+    std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id);
+
+    std::mt19937 rng(params.seed);
+
+    llama_set_causal_attn(ctx, false);
+
+    int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
+
+    std::vector<llama_token_data> candidates(n_vocab);
+    std::vector<llama_token_data> conf_candidates;
+    conf_candidates.reserve(params.max_length);
+    std::vector<int32_t> mask_positions;
+    mask_positions.reserve(params.max_length);
+
+    // Setup sampler chain
+    struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
+    if (params.top_k > 0) {
+        llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
+    }
+    if (params.top_p < 1.0f) {
+        llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
+    }
+    if (params.temperature > 0.0f) {
+        llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
+    }
+    llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));
+
+    struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);
+
+    llama_batch batch = llama_batch_init(params.max_length, 0, 1);
+    batch.n_tokens    = params.max_length;
+
+    // Pre-allocate buffers for CFG if needed
+    int32_t                  logits_size = n_vocab * params.max_length;
+    std::vector<float>       cond_logits_buffer;
+    std::vector<llama_token> un_x_buffer;
+    if (params.cfg_scale > 0.0f) {
+        cond_logits_buffer.resize(logits_size);
+        un_x_buffer.resize(params.max_length);
+    }
+
+    // For block-based processing
+    std::vector<int32_t> num_transfer_tokens;
+    int32_t              num_blocks      = 1;
+    int32_t              steps_per_block = params.steps;
+
+    if (params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) {
+        GGML_ASSERT(params.max_length % params.block_length == 0);
+        num_blocks = params.max_length / params.block_length;
+        GGML_ASSERT(params.steps % num_blocks == 0);
+        steps_per_block = params.steps / num_blocks;
+    }
+
+    std::vector<float> confidence(params.max_length);
+
+    int64_t total_sampling_time = 0;
+    int64_t total_time          = 0;
+    int64_t time_start          = ggml_time_us();
+
+    for (int block_num = 0; block_num < num_blocks; block_num++) {
+        int32_t block_start = (params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) ? n_input + block_num * params.block_length : 0;
+        int32_t block_end   = (params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) ?
+                                  std::min(n_input + (block_num + 1) * params.block_length, params.max_length) :
+                                  params.max_length;
+
+        // Count masked tokens in current block for block-based processing
+        if (params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) {
+            int32_t block_mask_count = 0;
+            for (int i = block_start; i < block_end; i++) {
+                if (output_tokens[i] == params.mask_token_id) {
+                    block_mask_count++;
+                }
+            }
+            num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block);
+        }
+
+        for (int32_t step = 0; step < steps_per_block; step++) {
+            int32_t global_step = block_num * steps_per_block + step;
+
+            if (params.step_callback) {
+                if (!params.step_callback(
+                        global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) {
+                    break;
+                }
+            }
+
+            // Setup batch
+            for (int32_t i = 0; i < params.max_length; i++) {
+                batch.token[i]     = output_tokens[i];
+                batch.pos[i]       = i;
+                batch.n_seq_id[i]  = 1;
+                batch.seq_id[i][0] = 0;
+                batch.logits[i]    = 1;
+            }
+
+            float * logits = nullptr;
+
+            if (params.cfg_scale > 0.0f) {
+                int ret = llama_decode(ctx, batch);
+                if (ret != 0) {
+                    LOG_ERR("Failed to generate conditional");
+                    break;
+                }
+                float * cond_logits_ptr = llama_get_logits(ctx);
+                std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float));
+
+                // Unconditional generation (mask input)
+                std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin());
+                for (int32_t i = 0; i < n_input; i++) {
+                    un_x_buffer[i] = params.mask_token_id;
+                }
+
+                for (int32_t i = 0; i < params.max_length; i++) {
+                    batch.token[i] = un_x_buffer[i];
+                }
+                ret = llama_decode(ctx, batch);
+                if (ret != 0) {
+                    LOG_ERR("Failed to generate unconditional");
+                    break;
+                }
+                float * uncond_logits = llama_get_logits(ctx);
+
+                // Apply CFG
+                for (int32_t i = 0; i < logits_size; i++) {
+                    cond_logits_buffer[i] =
+                        uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]);
+                }
+                logits = cond_logits_buffer.data();
+            } else {
+                int ret = llama_decode(ctx, batch);
+                if (ret != 0) {
+                    LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret);
+                    break;
+                }
+                logits = llama_get_logits(ctx);
+            }
+
+            if (!logits) {
+                LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
+                break;
+            }
+
+            auto get_logits_for_pos = [&](int32_t pos) -> const float * {
+                if (params.shift_logits) {
+                    return pos == 0 ? logits : logits + (pos - 1) * n_vocab;
+                }
+                return logits + pos * n_vocab;
+            };
+
+            int64_t time_start_sampling = ggml_time_us();
+
+            mask_positions.clear();
+            for (int32_t i = 0; i < params.max_length; i++) {
+                if (output_tokens[i] == params.mask_token_id) {
+                    // For block-based, only consider current block
+                    if (params.schedule != DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED || (i >= block_start && i < block_end)) {
+                        mask_positions.push_back(i);
+                    }
+                }
+            }
+
+            if (mask_positions.empty()) {
+                break;
+            }
+
+            if (params.add_gumbel_noise && params.temperature > 0.0f) {
+                add_gumbel_noise(logits, n_vocab, params.temperature, rng);
+            }
+
+            if (params.algorithm == DIFFUSION_ALGORITHM_ORIGIN) {
+                int32_t transfer_count = calculate_transfer_count(
+                    step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
+                float p_transfer = (float) transfer_count / mask_positions.size();
+
+                for (int32_t pos : mask_positions) {
+                    if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
+                        const float * pos_logits = get_logits_for_pos(pos);
+                        for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
+                            candidates[token_id].id    = token_id;
+                            candidates[token_id].logit = pos_logits[token_id];
+                            candidates[token_id].p     = 0.0f;
+                        }
+
+                        llama_token_data_array cur_p = {
+                            candidates.data(),
+                            (size_t) n_vocab,
+                            -1,
+                            false,
+                        };
+
+                        llama_sampler_apply(sampler, &cur_p);
+                        output_tokens[pos] = cur_p.data[cur_p.selected].id;
+                    }
+                }
+            } else {
+                std::vector<std::pair<float, int32_t>> confidences;
+                std::vector<llama_token>               sampled_tokens(mask_positions.size());
+
+                for (size_t i = 0; i < mask_positions.size(); i++) {
+                    int32_t       pos        = mask_positions[i];
+                    const float * pos_logits = get_logits_for_pos(pos);
+
+                    for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
+                        candidates[token_id].logit = pos_logits[token_id];
+                        candidates[token_id].p     = 0.0f;
+                        candidates[token_id].id    = token_id;
+                    }
+
+                    llama_token_data_array cur_p = {
+                        candidates.data(),
+                        candidates.size(),
+                        -1,
+                        false,
+                    };
+
+                    llama_sampler_apply(sampler, &cur_p);
+                    llama_token sampled_token = cur_p.data[cur_p.selected].id;
+
+                    float conf = calculate_confidence(cur_p, params.algorithm, rng);
+
+                    sampled_tokens[i] = sampled_token;
+                    confidences.emplace_back(conf, i);
+                }
+
+                int32_t transfer_count = calculate_transfer_count(
+                    step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
+
+                if (transfer_count > 0) {
+                    if (params.alg_temp == 0.0f) {
+                        std::partial_sort(confidences.begin(),
+                                          confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()),
+                                          confidences.end(),
+                                          [](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
+                                              if (a.first != b.first) {
+                                                  return a.first > b.first;
+                                              }
+                                              return a.second < b.second;
+                                          });
+
+                        for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
+                            int32_t mask_idx   = confidences[i].second;
+                            int32_t pos        = mask_positions[mask_idx];
+                            output_tokens[pos] = sampled_tokens[mask_idx];
+                        }
+                    } else {
+                        conf_candidates.clear();
+                        for (size_t i = 0; i < confidences.size(); i++) {
+                            float conf_logit = confidences[i].first / params.alg_temp;
+                            conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f });
+                        }
+
+                        llama_token_data_array conf_array = {
+                            conf_candidates.data(),
+                            conf_candidates.size(),
+                            -1,
+                            false,
+                        };
+
+                        for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
+                            llama_sampler_apply(dist_sampler, &conf_array);
+                            int32_t selected_idx = conf_array.selected;
+                            int32_t mask_idx     = selected_idx;
+                            int32_t pos          = mask_positions[mask_idx];
+                            output_tokens[pos]   = sampled_tokens[mask_idx];
+
+                            conf_candidates[selected_idx].p = 0.0f;
+                            conf_array.selected             = -1;
+                        }
+                    }
+                }
+            }
+
+            int64_t time_end_sampling = ggml_time_us();
+            total_sampling_time += time_end_sampling - time_start_sampling;
+        }
+    }
+
+    int64_t time_end = ggml_time_us();
+    total_time += time_end - time_start;
+
+    LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
+            total_time / 1000.0,
+            total_time / 1000.0 / params.steps,
+            total_sampling_time / 1000.0 / params.steps);
+
+    llama_batch_free(batch);
+    llama_sampler_free(sampler);
+    llama_sampler_free(dist_sampler);
+
+    n_generated = params.max_length;
+}
diff --git a/examples/diffusion/diffusion.h b/examples/diffusion/diffusion.h
new file mode 100644
index 000000000000..7831445224c9
--- /dev/null
+++ b/examples/diffusion/diffusion.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include "llama.h"
+
+#include <cstdint>
+
+enum diffusion_algorithm {
+    DIFFUSION_ALGORITHM_ORIGIN           = 0,
+    DIFFUSION_ALGORITHM_ENTROPY_BASED    = 1,
+    DIFFUSION_ALGORITHM_MARGIN_BASED     = 2,
+    DIFFUSION_ALGORITHM_RANDOM           = 3,
+    DIFFUSION_ALGORITHM_CONFIDENCE_BASED = 4,
+};
+
+// Unified transfer scheduling methods
+enum diffusion_transfer_schedule {
+    DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED = 0,  // Dream-style: (1.0 - s/t) * remaining
+    DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED    = 1,  // LLaDA-style: process in blocks with get_num_transfer_tokens
+};
+
+typedef bool (*diffusion_step_callback_t)(int32_t             step,
+                                          int32_t             total_steps,
+                                          const llama_token * tokens,
+                                          int32_t             n_tokens,
+                                          void *              user_data);
+
+struct diffusion_params {
+    int32_t                   steps                   = 0;
+    float                     temperature             = 0;
+    llama_token               mask_token_id           = LLAMA_TOKEN_NULL;
+    diffusion_step_callback_t step_callback           = nullptr;
+    void *                    step_callback_user_data = nullptr;
+    int32_t                   seed                    = 0;
+    bool                      visual_mode             = false;
+    bool                      shift_logits            = false;  // Shift logits by -1 after decode
+
+    float   top_p = 0.;
+    int32_t top_k = 0.;
+
+    diffusion_algorithm         algorithm = DIFFUSION_ALGORITHM_CONFIDENCE_BASED;
+    diffusion_transfer_schedule schedule  = DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED;
+
+    float   cfg_scale        = 0.;     // Config scale for classifier-free guidance
+    float   eps              = 0.;     // Timestep scheduling
+    int32_t block_length     = 0;      // Block size (for block scheduling)
+    float   alg_temp         = 0;      // algorithm temperature (0.0 = deterministic)
+    bool    add_gumbel_noise = false;  // Add gumbel noise to the logits if temp > 0.0
+
+    int32_t max_length = 0;            // Maximum sequence length
+};
+
+void diffusion_generate(llama_context *          ctx,
+                        const llama_token *      input_tokens,
+                        llama_token *            output_tokens,
+                        int32_t                  n_input,
+                        const diffusion_params & params,
+                        int32_t &                n_generated);

From 935a3402923e79679e056f1d8fdc6bc48ec6dbbc Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Mon, 4 May 2026 16:23:26 +0200
Subject: [PATCH 11/13] server: implement /models?reload=1 (#21848)

---
 tools/server/README.md                 |   6 +-
 tools/server/server-models.cpp         | 334 ++++++++++++++++++-------
 tools/server/server-models.h           |   8 +
 tools/server/tests/unit/test_router.py |  48 ++++
 tools/server/tests/utils.py            |   5 +
 5 files changed, 313 insertions(+), 88 deletions(-)

diff --git a/tools/server/README.md b/tools/server/README.md
index a0cfdbb6fecc..62f918ce4a8c 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -1646,7 +1646,11 @@ Listing all models in cache. The model metadata will also include a field to ind
 }
 ```
 
-Note: For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`.
+Note:
+1. For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`.
+2. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follow:
+    - If a model is running but updated or removed from the source, it will be unloaded
+    - If a model is not running, it will be added or updated according to the source
 
 The `status` object can be:
 
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index db6cbce8f9d8..5a05ca203373 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -243,9 +243,8 @@ void server_models::add_model(server_model_meta && meta) {
     };
 }
 
-// TODO: allow refreshing cached model list
 void server_models::load_models() {
-    // loading models from 3 sources:
+    // Phase 1: load presets from all sources — pure I/O, no lock needed
     // 1. cached models
     common_presets cached_models = ctx_preset.load_from_cache();
     SRV_INF("Loaded %zu cached model presets\n", cached_models.size());
@@ -270,112 +269,266 @@ void server_models::load_models() {
 
     // note: if a model exists in both cached and local, local takes precedence
     common_presets final_presets;
-    for (const auto & [name, preset] : cached_models) {
-        final_presets[name] = preset;
-    }
-    for (const auto & [name, preset] : local_models) {
-        final_presets[name] = preset;
-    }
-
-    // process custom presets from INI
+    for (const auto & [name, preset] : cached_models) final_presets[name] = preset;
+    for (const auto & [name, preset] : local_models)  final_presets[name] = preset;
     for (const auto & [name, custom] : custom_presets) {
         if (final_presets.find(name) != final_presets.end()) {
-            // apply custom config if exists
-            common_preset & target = final_presets[name];
-            target.merge(custom);
+            final_presets[name].merge(custom);
         } else {
-            // otherwise add directly
             final_presets[name] = custom;
         }
     }
-
-    // server base preset from CLI args take highest precedence
+    // server base preset from CLI args takes highest precedence
     for (auto & [name, preset] : final_presets) {
         preset.merge(base_preset);
     }
 
-    // convert presets to server_model_meta and add to mapping
-    for (const auto & preset : final_presets) {
-        server_model_meta meta{
-            /* preset       */ preset.second,
-            /* name         */ preset.first,
-            /* aliases      */ {},
-            /* tags         */ {},
-            /* port         */ 0,
-            /* status       */ SERVER_MODEL_STATUS_UNLOADED,
-            /* last_used    */ 0,
-            /* args         */ std::vector<std::string>(),
-            /* exit_code    */ 0,
-            /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
-        };
-        add_model(std::move(meta));
-    }
-
-    // log available models
-    {
-        std::unordered_set<std::string> custom_names;
-        for (const auto & [name, preset] : custom_presets) {
-            custom_names.insert(name);
-        }
-        auto join_set = [](const std::set<std::string> & s) {
-            std::string result;
-            for (const auto & v : s) {
-                if (!result.empty()) {
-                    result += ", ";
-                }
-                result += v;
-            }
-            return result;
-        };
-
+    // Helpers that read `mapping` — must be called while holding the lock.
+    std::unordered_set<std::string> custom_names;
+    for (const auto & [name, preset] : custom_presets) custom_names.insert(name);
+    auto join_set = [](const std::set<std::string> & s) {
+        std::string result;
+        for (const auto & v : s) {
+            if (!result.empty()) result += ", ";
+            result += v;
+        }
+        return result;
+    };
+    auto log_available_models = [&]() {
         SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
         for (const auto & [name, inst] : mapping) {
             bool has_custom = custom_names.find(name) != custom_names.end();
             std::string info;
-            if (!inst.meta.aliases.empty()) {
-                info += " (aliases: " + join_set(inst.meta.aliases) + ")";
+            if (!inst.meta.aliases.empty()) info += " (aliases: " + join_set(inst.meta.aliases) + ")";
+            if (!inst.meta.tags.empty())    info += " [tags: "    + join_set(inst.meta.tags)    + "]";
+            SRV_INF("  %c %s%s\n", has_custom ? '*' : ' ', name.c_str(), info.c_str());
+        }
+    };
+    auto apply_stop_timeout = [&]() {
+        for (auto & [name, inst] : mapping) {
+            std::string val;
+            if (inst.meta.preset.get_option(COMMON_ARG_PRESET_STOP_TIMEOUT, val)) {
+                try {
+                    inst.meta.stop_timeout = std::stoi(val);
+                } catch (...) {
+                    SRV_WRN("invalid stop-timeout value '%s' for model '%s', using default %d seconds\n",
+                        val.c_str(), name.c_str(), DEFAULT_STOP_TIMEOUT);
+                    inst.meta.stop_timeout = DEFAULT_STOP_TIMEOUT;
+                }
             }
-            if (!inst.meta.tags.empty()) {
-                info += " [tags: " + join_set(inst.meta.tags) + "]";
+        }
+    };
+    // update_args() injects HOST/PORT/ALIAS, so strip them before comparing presets
+    auto preset_options_for_compare = [](common_preset p) {
+        p.unset_option("LLAMA_ARG_HOST");
+        p.unset_option("LLAMA_ARG_PORT");
+        p.unset_option("LLAMA_ARG_ALIAS");
+        return p.options;
+    };
+
+    // Phase 2: acquire the lock once for all mapping mutations.
+    // We temporarily release it only when calling functions that acquire it internally
+    // (unload, load) or when joining threads (the monitoring thread calls update_status
+    // which locks the mutex, so joining while holding it would deadlock).
+    std::unique_lock<std::mutex> lk(mutex);
+    bool is_first_load = mapping.empty();
+
+    if (is_first_load) {
+        // FIRST LOAD: add all models, then unlock for autoloading
+        for (const auto & [name, preset] : final_presets) {
+            server_model_meta meta{
+                /* preset       */ preset,
+                /* name         */ name,
+                /* aliases      */ {},
+                /* tags         */ {},
+                /* port         */ 0,
+                /* status       */ SERVER_MODEL_STATUS_UNLOADED,
+                /* last_used    */ 0,
+                /* args         */ std::vector<std::string>(),
+                /* exit_code    */ 0,
+                /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
+            };
+            add_model(std::move(meta));
+        }
+        apply_stop_timeout();
+        log_available_models();
+
+        std::vector<std::string> models_to_load;
+        for (const auto & [name, inst] : mapping) {
+            std::string val;
+            if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val) && common_arg_utils::is_truthy(val)) {
+                models_to_load.push_back(name);
             }
-            SRV_INF("  %c %s%s\n", has_custom ? '*' : ' ', name.c_str(), info.c_str());
         }
-    }
+        if ((int)models_to_load.size() > base_params.models_max) {
+            throw std::runtime_error(string_format(
+                "number of models to load on startup (%zu) exceeds models_max (%d)",
+                models_to_load.size(), base_params.models_max));
+        }
 
-    // handle custom stop-timeout option
-    for (auto & [name, inst] : mapping) {
-        std::string val;
-        if (inst.meta.preset.get_option(COMMON_ARG_PRESET_STOP_TIMEOUT, val)) {
-            try {
-                inst.meta.stop_timeout = std::stoi(val);
-            } catch (...) {
-                SRV_WRN("invalid stop-timeout value '%s' for model '%s', using default %d seconds\n",
-                    val.c_str(), name.c_str(), DEFAULT_STOP_TIMEOUT);
-                inst.meta.stop_timeout = DEFAULT_STOP_TIMEOUT;
+        lk.unlock();
+        for (const auto & name : models_to_load) {
+            SRV_INF("(startup) loading model %s\n", name.c_str());
+            load(name);
+        }
+    } else {
+        // RELOAD: diff the new preset list against the current mapping and reconcile
+        is_reloading = true;
+
+        // find running models whose source was removed or whose preset changed
+        std::vector<std::string> to_unload;
+        for (const auto & [name, inst] : mapping) {
+            if (!inst.meta.is_running()) continue;
+            auto it = final_presets.find(name);
+            if (it == final_presets.end()) {
+                to_unload.push_back(name); // removed from source
+            } else if (preset_options_for_compare(inst.meta.preset) != preset_options_for_compare(it->second)) {
+                to_unload.push_back(name); // preset changed
             }
         }
-    }
 
-    // load any autoload models
-    std::vector<std::string> models_to_load;
-    for (const auto & [name, inst] : mapping) {
-        std::string val;
-        if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val)) {
-            if (common_arg_utils::is_truthy(val)) {
-                models_to_load.push_back(name);
+        // unload() acquires the lock internally, so release before each call
+        for (const auto & name : to_unload) {
+            SRV_INF("(reload) unloading model name=%s (source updated or removed)\n", name.c_str());
+            lk.unlock();
+            unload(name);
+            lk.lock();
+        }
+
+        // wait for all targeted models to reach UNLOADED; cv.wait handles unlock/relock
+        cv.wait(lk, [&]() {
+            for (const auto & name : to_unload) {
+                auto it = mapping.find(name);
+                if (it != mapping.end() && it->second.meta.is_running()) return false;
+            }
+            return true;
+        });
+
+        // collect all threads to join in one pass while the lock is held:
+        // - monitoring threads from just-unloaded models (to_unload)
+        // - threads of already-UNLOADED models that are being removed from source
+        std::vector<std::thread> threads_to_join;
+        for (const auto & name : to_unload) {
+            auto it = mapping.find(name);
+            if (it != mapping.end() && it->second.th.joinable()) {
+                threads_to_join.push_back(std::move(it->second.th));
             }
         }
-    }
-    if ((int)models_to_load.size() > base_params.models_max) {
-        throw std::runtime_error(string_format(
-            "number of models to load on startup (%zu) exceeds models_max (%d)",
-            models_to_load.size(),
-            base_params.models_max
-        ));
-    }
-    for (const auto & name : models_to_load) {
-        SRV_INF("(startup) loading model %s\n", name.c_str());
-        load(name);
+        for (auto & [name, inst] : mapping) {
+            if (final_presets.find(name) == final_presets.end() && !inst.meta.is_running() && inst.th.joinable()) {
+                threads_to_join.push_back(std::move(inst.th));
+            }
+        }
+
+        // join outside the lock — monitoring thread calls update_status (needs lock)
+        lk.unlock();
+        for (auto & th : threads_to_join) th.join();
+        lk.lock();
+
+        // erase models no longer in any source
+        for (auto it = mapping.begin(); it != mapping.end(); ) {
+            if (final_presets.find(it->first) == final_presets.end()) {
+                SRV_INF("(reload) removing model name=%s (no longer in source)\n", it->first.c_str());
+                GGML_ASSERT(!it->second.th.joinable()); // must have been joined above
+                it = mapping.erase(it);
+            } else {
+                ++it;
+            }
+        }
+
+        // update presets for non-running models still in source
+        for (auto & [name, inst] : mapping) {
+            if (inst.meta.is_running()) continue;
+            auto it = final_presets.find(name);
+            if (it == final_presets.end()) continue; // erased above
+
+            inst.meta.preset = it->second;
+
+            // re-parse aliases, then validate against other models
+            std::set<std::string> new_aliases;
+            std::string alias_str;
+            if (inst.meta.preset.get_option("LLAMA_ARG_ALIAS", alias_str) && !alias_str.empty()) {
+                for (auto & alias : string_split<std::string>(alias_str, ',')) {
+                    alias = string_strip(alias);
+                    if (!alias.empty()) new_aliases.insert(alias);
+                }
+            }
+            inst.meta.aliases.clear();
+            for (const auto & alias : new_aliases) {
+                bool conflict = false;
+                for (const auto & [other_name, other_inst] : mapping) {
+                    if (other_name == name) continue;
+                    if (other_name == alias || other_inst.meta.aliases.count(alias)) {
+                        SRV_WRN("(reload) alias '%s' for model '%s' conflicts with model '%s', skipping\n",
+                            alias.c_str(), name.c_str(), other_name.c_str());
+                        conflict = true;
+                        break;
+                    }
+                }
+                if (!conflict) inst.meta.aliases.insert(alias);
+            }
+
+            // re-parse tags
+            inst.meta.tags.clear();
+            std::string tags_str;
+            if (inst.meta.preset.get_option("LLAMA_ARG_TAGS", tags_str) && !tags_str.empty()) {
+                for (auto & tag : string_split<std::string>(tags_str, ',')) {
+                    tag = string_strip(tag);
+                    if (!tag.empty()) inst.meta.tags.insert(tag);
+                }
+            }
+
+            inst.meta.exit_code = 0; // clear failed state so the model can be reloaded
+            inst.meta.update_args(ctx_preset, bin_path);
+        }
+
+        // add models that are new in this reload
+        std::vector<std::string> newly_added;
+        for (const auto & [name, preset] : final_presets) {
+            if (mapping.find(name) == mapping.end()) {
+                server_model_meta meta{
+                    /* preset       */ preset,
+                    /* name         */ name,
+                    /* aliases      */ {},
+                    /* tags         */ {},
+                    /* port         */ 0,
+                    /* status       */ SERVER_MODEL_STATUS_UNLOADED,
+                    /* last_used    */ 0,
+                    /* args         */ std::vector<std::string>(),
+                    /* exit_code    */ 0,
+                    /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
+                };
+                add_model(std::move(meta));
+                newly_added.push_back(name);
+            }
+        }
+
+        apply_stop_timeout();
+
+        // clear reload flag before unlocking for autoload — load() blocks on !is_reloading,
+        // so clearing it here (while still locked) prevents a deadlock in the autoload calls below
+        is_reloading = false;
+        cv.notify_all();
+
+        log_available_models();
+
+        // collect autoload candidates while still under the lock
+        std::vector<std::string> to_autoload;
+        for (const auto & name : newly_added) {
+            auto it = mapping.find(name);
+            if (it != mapping.end()) {
+                std::string val;
+                if (it->second.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val) && common_arg_utils::is_truthy(val)) {
+                    to_autoload.push_back(name);
+                }
+            }
+        }
+
+        lk.unlock();
+        for (const auto & name : to_autoload) {
+            SRV_INF("(reload) loading new model %s\n", name.c_str());
+            load(name);
+        }
     }
 }
 
@@ -536,7 +689,10 @@ void server_models::load(const std::string & name) {
     }
     unload_lru();
 
-    std::lock_guard<std::mutex> lk(mutex);
+    std::unique_lock<std::mutex> lk(mutex);
+    // edge case: block until any in-progress reload has finished so we always load
+    // against the freshest preset and a consistent mapping state
+    cv.wait(lk, [this]() { return !is_reloading; });
 
     auto meta = mapping[name].meta;
     if (meta.status != SERVER_MODEL_STATUS_UNLOADED) {
@@ -993,7 +1149,11 @@ void server_models_routes::init_routes() {
         return res;
     };
 
-    this->get_router_models = [this](const server_http_req &) {
+    this->get_router_models = [this](const server_http_req & req) {
+        bool reload = !req.get_param("reload", "").empty();
+        if (reload) {
+            models.load_models();
+        }
         auto res = std::make_unique<server_http_res>();
         json models_json = json::array();
         auto all_models = models.get_all_meta();
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index b3428ef54475..64a15f5ba404 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -100,6 +100,9 @@ struct server_models {
     std::condition_variable cv_stop;
     std::set<std::string> stopping_models;
 
+    // set to true while load_models() is executing a reload; load() will wait until clear
+    bool is_reloading = false;
+
     common_preset_context ctx_preset;
 
     common_params base_params;
@@ -118,6 +121,11 @@ struct server_models {
 public:
     server_models(const common_params & params, int argc, char ** argv);
 
+    // (re-)load the list of models from various sources and prepare the metadata mapping
+    // - if this is called the first time, simply populate the metadata
+    // - if this is called subsequently (e.g. when refreshing from disk):
+    //   - if a model is running but updated or removed from the source, it will be unloaded
+    //   - if a model is not running, it will be added or updated according to the source
     void load_models();
 
     // check if a model instance exists (thread-safe)
diff --git a/tools/server/tests/unit/test_router.py b/tools/server/tests/unit/test_router.py
index 79e60db40830..c93b92b0b2ee 100644
--- a/tools/server/tests/unit/test_router.py
+++ b/tools/server/tests/unit/test_router.py
@@ -62,6 +62,12 @@ def test_router_chat_completion_stream(model: str, success: bool):
         assert content == ""
 
 
+def _get_model_ids(is_reload: bool) -> set[str]:
+    res = server.make_request("GET", "/models" + ("?reload=1" if is_reload else ""))
+    assert res.status_code == 200
+    return {item["id"] for item in res.body.get("data", [])}
+
+
 def _get_model_status(model_id: str) -> str:
     res = server.make_request("GET", "/models")
     assert res.status_code == 200
@@ -205,3 +211,45 @@ def test_router_api_key_required():
     )
     assert authed.status_code == 200
     assert "error" not in authed.body
+
+
+def test_router_reload_models():
+    """POST /models/reload re-reads the INI preset and updates the model list."""
+    global server
+
+    preset_path = os.path.join(TMP_DIR, "test_reload.ini")
+
+    # Initial preset: two models
+    with open(preset_path, "w") as f:
+        f.write(
+            "[model-reload-a]\n"
+            "hf-repo = ggml-org/test-model-stories260K\n"
+            "\n"
+            "[model-reload-b]\n"
+            "hf-repo = ggml-org/test-model-stories260K-infill\n"
+        )
+
+    server.models_preset = preset_path
+    server.start()
+
+    ids = _get_model_ids(is_reload=False)
+    assert "model-reload-a" in ids
+    assert "model-reload-b" in ids
+
+    # Updated preset: remove a, keep b unchanged, add c
+    with open(preset_path, "w") as f:
+        f.write(
+            "[model-reload-b]\n"
+            "hf-repo = ggml-org/test-model-stories260K-infill\n"
+            "\n"
+            "[model-reload-c]\n"
+            "hf-repo = ggml-org/test-model-stories260K\n"
+        )
+
+    try:
+        ids = _get_model_ids(is_reload=True)
+        assert "model-reload-a" not in ids, "removed model should no longer appear"
+        assert "model-reload-b" in ids, "unchanged model should still appear"
+        assert "model-reload-c" in ids, "newly added model should appear"
+    finally:
+        os.remove(preset_path)
diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py
index 88700487be85..15f9bd95d735 100644
--- a/tools/server/tests/utils.py
+++ b/tools/server/tests/utils.py
@@ -5,6 +5,8 @@
 
 import subprocess
 import os
+
+TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tmp")
 import re
 import json
 from json import JSONDecodeError
@@ -86,6 +88,7 @@ class ServerProcess:
     api_key: str | None = None
     models_dir: str | None = None
     models_max: int | None = None
+    models_preset: str | None = None
     no_models_autoload: bool | None = None
     lora_files: List[str] | None = None
     enable_ctx_shift: int | None = False
@@ -156,6 +159,8 @@ def start(self, timeout_seconds: int = DEFAULT_HTTP_TIMEOUT) -> None:
             server_args.extend(["--models-dir", self.models_dir])
         if self.models_max is not None:
             server_args.extend(["--models-max", self.models_max])
+        if self.models_preset:
+            server_args.extend(["--models-preset", self.models_preset])
         if self.n_batch:
             server_args.extend(["--batch-size", self.n_batch])
         if self.n_ubatch:

From e77056f9b25c6aab8dcc409879ac0b81b4fb120d Mon Sep 17 00:00:00 2001
From: leonardHONG <2695316095@qq.com>
Date: Mon, 4 May 2026 22:24:05 +0800
Subject: [PATCH 12/13] CUDA: use fastdiv for batch index split in get_rows
 (#22650)

---
 ggml/src/ggml-cuda/getrows.cu | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu
index e99cba63d344..36b840e81484 100644
--- a/ggml/src/ggml-cuda/getrows.cu
+++ b/ggml/src/ggml-cuda/getrows.cu
@@ -6,17 +6,18 @@ template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void k_get_rows(
         const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
         const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
-        /*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/
+        /*const int64_t ne10,*/ const int64_t ne11, const uint3 ne12_fdv, /*const int64_t ne13,*/
         /*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
         /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
         const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
 
-    for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) {
+    for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) {
         for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) {
             // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
             const int i10 =  blockIdx.x;
-            const int i11 =  z / ne12; // TODO fastdiv
-            const int i12 =  z % ne12;
+            const uint2 dm  = fast_div_modulo((uint32_t)z, ne12_fdv);
+            const int i11 =  dm.x;
+            const int i12 =  dm.y;
 
             const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
 
@@ -42,17 +43,18 @@ template<typename src0_t, typename dst_t>
 static __global__ void k_get_rows_float(
         const src0_t * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
         const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
-        /*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/
+        /*const int64_t ne10,*/ const int64_t ne11, const uint3 ne12_fdv, /*const int64_t ne13,*/
         /*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
         /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
         const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
 
-    for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) {
+    for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) {
         for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) {
             // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
             const int i10 = blockIdx.x;
-            const int i11 = z / ne12; // TODO fastdiv
-            const int i12 = z % ne12;
+            const uint2 dm = fast_div_modulo((uint32_t)z, ne12_fdv);
+            const int i11 = dm.x;
+            const int i12 = dm.y;
 
             if (i00 >= ne00) {
                 return;
@@ -115,10 +117,14 @@ static void get_rows_cuda_q(
 
     GGML_ASSERT(ne00 % 2 == 0);
 
+    GGML_ASSERT(ne12 > 0);
+    GGML_ASSERT(ne11 <= std::numeric_limits<uint32_t>::max() / ne12);
+    const uint3 ne12_fdv = init_fastdiv_values(ne12);
+
     k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
         src0_d, src1_d, dst_d,
         ne00, /*ne01, ne02, ne03,*/
-        /*ne10,*/ ne11, ne12, /*ne13,*/
+        /*ne10,*/ ne11, ne12_fdv, /*ne13,*/
         /* s0,*/ s1, s2, s3,
         /* nb00,*/ nb01, nb02, nb03,
         s10, s11, s12/*, s13*/);
@@ -146,10 +152,14 @@ static void get_rows_cuda_float(
     const size_t s12 = nb12 / sizeof(int32_t);
     // const size_t s13 = nb13 / sizeof(int32_t);
 
+    GGML_ASSERT(ne12 > 0);
+    GGML_ASSERT(ne11 <= std::numeric_limits<uint32_t>::max() / ne12);
+    const uint3 ne12_fdv = init_fastdiv_values(ne12);
+
     k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
         src0_d, src1_d, dst_d,
         ne00, /*ne01, ne02, ne03,*/
-        /*ne10,*/ ne11, ne12, /*ne13,*/
+        /*ne10,*/ ne11, ne12_fdv, /*ne13,*/
         /* s0,*/ s1, s2, s3,
         /* nb00,*/ nb01, nb02, nb03,
         s10, s11, s12/*, s13*/);

From eff06702b2a52e1020ea009ebd86cb9f5acabab5 Mon Sep 17 00:00:00 2001
From: Charles Xu <charles.xu@arm.com>
Date: Mon, 4 May 2026 21:13:31 +0200
Subject: [PATCH 13/13] kleidiai : update to v1.24.0 and use release archive
 (#22549)

---
 ggml/src/ggml-cpu/CMakeLists.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index c1c225f01979..869c7b238bf6 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -578,13 +578,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         # Fetch KleidiAI sources:
         include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.22.0")
-        set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5  "54049037570ab0ee0a0d126b2ba5ece1")
+        set(KLEIDIAI_COMMIT_TAG "v1.24.0")
+        set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/releases/download/${KLEIDIAI_COMMIT_TAG}/kleidiai-${KLEIDIAI_COMMIT_TAG}-src.tar.gz")
+        set(KLEIDIAI_RELEASE_ARCHIVE_MD5  "2f02ebe29573d45813e671eb304f2a00")
 
         set(KLEIDIAI_FETCH_ARGS
             URL ${KLEIDIAI_DOWNLOAD_URL}
-            URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5}
+            URL_HASH MD5=${KLEIDIAI_RELEASE_ARCHIVE_MD5}
         )
         if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
             list(APPEND KLEIDIAI_FETCH_ARGS DOWNLOAD_EXTRACT_TIMESTAMP NEW)