quantumaikr
diff --git a/include/turboquant/tq_engine.h b/include/turboquant/tq_engine.h
Lines changed: 52 additions & 10 deletions
diff --git a/src/engine/tq_generate.c b/src/engine/tq_generate.c
Lines changed: 8 additions & 3 deletions
diff --git a/src/engine/tq_model.c b/src/engine/tq_model.c
Lines changed: 172 additions & 9 deletions
@@ -16,28 +16,57 @@ typedef struct {
     int n_layers;
     int hidden_dim;
     int intermediate_dim;
-    int n_heads;         /* query heads */
-    int n_kv_heads;      /* KV heads (GQA) */
-    int head_dim;
+    int n_heads;         /* query heads (for self_attn layers) */
+    int n_kv_heads;      /* KV heads (GQA, for self_attn layers) */
+    int head_dim;        /* head dimension for self_attn */
     int vocab_size;
     int max_seq_len;
     float rope_freq_base;
     float rms_norm_eps;
+
+    /* DeltaNet (linear_attention) config */
+    int delta_n_heads;       /* number of DeltaNet heads (e.g., 16) */
+    int delta_key_head_dim;  /* key head dim (e.g., 128) */
+    int delta_value_head_dim;/* value head dim (e.g., 128) */
+    int delta_conv_width;    /* conv1d kernel width (e.g., 4) */
+    float partial_rotary_factor; /* fraction of head_dim that uses RoPE (e.g., 0.25) */
+
+    /* QK-norm for self_attn (Qwen3.5 style) */
+    int use_qk_norm;         /* 1 if q_norm/k_norm weights present */
+    int attn_output_gate;    /* 1 if q_proj includes output gate (doubled q_proj output) */
 } tq_model_config_t;
 
 /* ============================================================
  * Model weights (in memory)
  * ============================================================ */
 typedef struct {
-    float* attn_norm;     /* [hidden_dim] */
-    float* ffn_norm;      /* [hidden_dim] */
+    /* RMSNorm weights */
+    float* attn_norm;     /* [hidden_dim] input_layernorm */
+    float* ffn_norm;      /* [hidden_dim] post_attention_layernorm */
+
+    /* Standard self_attn weights (NULL for DeltaNet layers) */
     float* wq;            /* [n_heads * head_dim, hidden_dim] */
     float* wk;            /* [n_kv_heads * head_dim, hidden_dim] */
     float* wv;            /* [n_kv_heads * head_dim, hidden_dim] */
     float* wo;            /* [hidden_dim, n_heads * head_dim] */
+    float* q_norm;        /* [head_dim] QK-norm for queries */
+    float* k_norm;        /* [head_dim] QK-norm for keys */
+
+    /* SwiGLU FFN weights (present on ALL layers) */
     float* w_gate;        /* [intermediate_dim, hidden_dim] */
     float* w_up;          /* [intermediate_dim, hidden_dim] */
     float* w_down;        /* [hidden_dim, intermediate_dim] */
+
+    /* DeltaNet (linear_attention) weights (NULL for self_attn layers) */
+    float* delta_a_log;       /* [delta_n_heads] decay parameter (log scale) */
+    float* delta_conv1d;      /* [qkv_dim, 1, conv_width] */
+    float* delta_dt_bias;     /* [delta_n_heads] delta bias */
+    float* delta_in_proj_a;   /* [delta_n_heads, hidden_dim] */
+    float* delta_in_proj_b;   /* [delta_n_heads, hidden_dim] */
+    float* delta_in_proj_qkv; /* [qkv_dim, hidden_dim] (qkv_dim = 3 * delta_n_heads * delta_key_head_dim) */
+    float* delta_in_proj_z;   /* [z_dim, hidden_dim] (z_dim = delta_n_heads * delta_value_head_dim) */
+    float* delta_norm;        /* [delta_value_head_dim] group norm weight */
+    float* delta_out_proj;    /* [hidden_dim, z_dim] */
 } tq_layer_weights_t;
 
 typedef struct {
@@ -80,12 +109,22 @@ typedef struct {
     float* hb2;         /* [intermediate_dim] FFN buffer 2 */
     float* logits;      /* [vocab_size] output logits */
 
-    /* KV cache — FP32 for values, quantized for keys via TurboQuant */
+    /* KV cache for self_attn layers */
     float* key_cache;    /* [n_layers, max_seq_len, n_kv_heads * head_dim] */
     float* value_cache;  /* [n_layers, max_seq_len, n_kv_heads * head_dim] */
     tq_type kv_quant_type; /* quantization type for KV attention */
     size_t kv_cache_size;
 
+    /* DeltaNet recurrent state */
+    float* delta_state;  /* [n_layers, delta_n_heads, key_head_dim, value_head_dim] */
+    float* conv_state;   /* [n_layers, qkv_dim, conv_width-1] */
+
+    /* DeltaNet workspace buffers */
+    float* delta_qkv;    /* [qkv_dim] workspace for QKV projection */
+    float* delta_z;      /* [z_dim] workspace for Z gate */
+    float* delta_ab;     /* [delta_n_heads * 2] workspace for a,b projections */
+    float* delta_out;    /* [z_dim] workspace for output */
+
     /* Quantization workspace */
     void* quant_key_buf;    /* workspace for quantized keys */
     float* quant_score_buf; /* workspace for quantized attention scores */
@@ -109,12 +148,15 @@ typedef struct {
  * Tokenizer
  * ============================================================ */
 typedef struct {
-    char** vocab;        /* token strings */
-    float* scores;       /* BPE merge scores */
-    int vocab_size;
+    char** vocab;        /* token strings, indexed by token_id */
+    float* scores;       /* BPE merge scores (merge priority) */
+    int vocab_size;      /* total vocab capacity (max_id + 1) */
     int max_token_len;
-    /* Sorted vocab for encoding */
+    int n_merges;        /* number of BPE merges loaded */
+    /* Sorted vocab for encoding (binary search by string) */
     int* sorted_indices;
+    /* Merge table: pairs of token IDs that merge into a result */
+    int* merge_pairs;    /* [n_merges * 3]: (token_a, token_b, result_id) */
 } tq_tokenizer_t;
 
 /* ============================================================
 
@@ -178,12 +178,17 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
     int output_pos = 0;
     int prev_token = prompt_tokens[n_prompt - 1];
 
-    /* EOS token IDs — common values */
-    int eos_token = 2;
+    /* EOS token IDs — check common values.
+     * Qwen3.5: eos = 248044 (<|endoftext|>), also 248046 (<|im_end|>)
+     * LLaMA: eos = 2 */
+    int eos_token1 = 2;       /* LLaMA convention */
+    int eos_token2 = 248044;  /* Qwen <|endoftext|> */
+    int eos_token3 = 248046;  /* Qwen <|im_end|> */
 
     /* Generate loop */
     while (generated < config->max_tokens) {
-        if (next_token == eos_token) break;
+        if (next_token == eos_token1 || next_token == eos_token2 ||
+            next_token == eos_token3) break;
         if (pos >= model->config.max_seq_len) break;
 
         /* Decode token to text */
 
@@ -699,19 +699,106 @@ tq_model_t* tq_load_model(const char* path) {
     if (wq0 && wk0) {
         int q_out = (int)wq0->shape[0];
         int k_out = (int)wk0->shape[0];
-        /* Common head_dim values: 64, 128 */
-        /* Try head_dim = 128, then 64, then 96 */
-        int head_dim = 128;
-        if (q_out % head_dim != 0) head_dim = 64;
-        if (q_out % head_dim != 0) head_dim = 96;
+
+        /* Try to detect head_dim from q_norm weight if available */
+        snprintf(name_buf, sizeof(name_buf),
+                 "model.layers.%d.self_attn.q_norm.weight", probe_layer);
+        tensor_info_t* qn0 = find_tensor(tensors, n_tensors, name_buf);
+        int head_dim;
+        if (qn0 && qn0->n_dims >= 1) {
+            head_dim = (int)qn0->shape[0];
+            model->config.use_qk_norm = 1;
+        } else {
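+            /* Probe common head_dim values in order: 128, 64, 96, 256. NOTE(review): 256 is only reached when q_out is not a multiple of 64, so it can never divide q_out. */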
+            head_dim = 128;
+            if (q_out % head_dim != 0) head_dim = 64;
+            if (q_out % head_dim != 0) head_dim = 96;
+            if (q_out % head_dim != 0) head_dim = 256;
+            model->config.use_qk_norm = 0;
+        }
         model->config.head_dim = head_dim;
-        model->config.n_heads = q_out / head_dim;
         model->config.n_kv_heads = k_out / head_dim;
+
+        /* Detect attn_output_gate: o_proj's input dim is n_heads * head_dim,
+         * so n_heads is derived from o_proj. If q_proj's output dim is exactly
+         * twice that (q_out == 2 * n_heads * head_dim), q_proj packs [Q, gate]
+         * and attn_output_gate is set; otherwise n_heads = q_out / head_dim. */
+        snprintf(name_buf, sizeof(name_buf),
+                 "model.layers.%d.self_attn.o_proj.weight", probe_layer);
+        tensor_info_t* wo0 = find_tensor(tensors, n_tensors, name_buf);
+        if (wo0 && wo0->n_dims >= 2) {
+            int o_in = (int)wo0->shape[1]; /* o_proj is [hidden_dim, n_heads*head_dim] */
+            int n_heads_from_o = o_in / head_dim;
+            if (q_out == n_heads_from_o * head_dim * 2) {
+                /* q_proj is doubled: [Q, gate_q] */
+                model->config.attn_output_gate = 1;
+                model->config.n_heads = n_heads_from_o;
+                fprintf(stderr, "tq_load_model: detected attn_output_gate=1 "
+                        "(q_proj=%d = 2 * %d * %d)\n",
+                        q_out, n_heads_from_o, head_dim);
+            } else {
+                model->config.attn_output_gate = 0;
+                model->config.n_heads = q_out / head_dim;
+            }
+        } else {
+            model->config.attn_output_gate = 0;
+            model->config.n_heads = q_out / head_dim;
+        }
     } else {
         /* Defaults for small models */
         model->config.head_dim = 64;
         model->config.n_heads = model->config.hidden_dim / 64;
         model->config.n_kv_heads = model->config.n_heads;
+        model->config.use_qk_norm = 0;
+        model->config.attn_output_gate = 0;
+    }
+
+    /* Detect DeltaNet config from first linear_attn layer */
+    model->config.delta_n_heads = 0;
+    model->config.delta_key_head_dim = 0;
+    model->config.delta_value_head_dim = 0;
+    model->config.delta_conv_width = 4;
+    model->config.partial_rotary_factor = 0.0f;
+    {
+        /* Find first DeltaNet layer */
+        int delta_layer = -1;
+        for (int l = 0; l < model->config.n_layers; l++) {
+            snprintf(name_buf, sizeof(name_buf),
+                     "model.layers.%d.linear_attn.A_log", l);
+            if (find_tensor(tensors, n_tensors, name_buf)) {
+                delta_layer = l;
+                break;
+            }
+        }
+        if (delta_layer >= 0) {
+            snprintf(name_buf, sizeof(name_buf),
+                     "model.layers.%d.linear_attn.A_log", delta_layer);
+            tensor_info_t* a_log = find_tensor(tensors, n_tensors, name_buf);
+            if (a_log) {
+                model->config.delta_n_heads = (int)a_log->shape[0];
+            }
+
+            snprintf(name_buf, sizeof(name_buf),
+                     "model.layers.%d.linear_attn.in_proj_qkv.weight", delta_layer);
+            tensor_info_t* qkv_proj = find_tensor(tensors, n_tensors, name_buf);
+            if (qkv_proj && model->config.delta_n_heads > 0) {
+                int qkv_dim = (int)qkv_proj->shape[0];
+                /* Assumes Q, K and V share delta_n_heads heads of equal dim: qkv_dim = 3 * n_heads * head_dim */
+                model->config.delta_key_head_dim = qkv_dim / (3 * model->config.delta_n_heads);
+                model->config.delta_value_head_dim = model->config.delta_key_head_dim;
+            }
+
+            snprintf(name_buf, sizeof(name_buf),
+                     "model.layers.%d.linear_attn.conv1d.weight", delta_layer);
+            tensor_info_t* conv = find_tensor(tensors, n_tensors, name_buf);
+            if (conv && conv->n_dims >= 3) {
+                model->config.delta_conv_width = (int)conv->shape[2];
+            }
+
+            fprintf(stderr, "tq_load_model: DeltaNet config — %d heads, key_dim=%d, val_dim=%d, conv_w=%d\n",
+                    model->config.delta_n_heads, model->config.delta_key_head_dim,
+                    model->config.delta_value_head_dim, model->config.delta_conv_width);
+        }
     }
 
     /* Detect intermediate_dim from gate projection (use probe_layer) */
@@ -730,10 +817,18 @@ tq_model_t* tq_load_model(const char* path) {
         model->config.intermediate_dim = model->config.hidden_dim * 4;
     }
 
-    /* Defaults */
+    /* Defaults — tuned for Qwen3.5 if DeltaNet detected */
     model->config.max_seq_len = 4096;
-    model->config.rope_freq_base = 10000.0f;
-    model->config.rms_norm_eps = 1e-5f;
+    if (model->config.delta_n_heads > 0) {
+        /* Qwen3.5 uses rope_theta=10M, rms_norm_eps=1e-6, partial_rotary=0.25 */
+        model->config.rope_freq_base = 10000000.0f;
+        model->config.rms_norm_eps = 1e-6f;
+        model->config.partial_rotary_factor = 0.25f;
+    } else {
+        model->config.rope_freq_base = 10000.0f;
+        model->config.rms_norm_eps = 1e-5f;
+        model->config.partial_rotary_factor = 0.0f;
+    }
 
     /* Allocate layer weight pointers */
     int n_layers = model->config.n_layers;
@@ -791,6 +886,74 @@ tq_model_t* tq_load_model(const char* path) {
                                  find_tensor(tensors, n_tensors, name_buf),
                                  &conv_buf, &conv_used, conv_capacity);
 
+        /* QK-norm weights (Qwen3.5 style) */
+        snprintf(name_buf, sizeof(name_buf),
+                 "model.layers.%d.self_attn.q_norm.weight", l);
+        layer->q_norm = load_tensor(data_base,
+                                     find_tensor(tensors, n_tensors, name_buf),
+                                     &conv_buf, &conv_used, conv_capacity);
+
+        snprintf(name_buf, sizeof(name_buf),
+                 "model.layers.%d.self_attn.k_norm.weight", l);
+        layer->k_norm = load_tensor(data_base,
+                                     find_tensor(tensors, n_tensors, name_buf),
+                                     &conv_buf, &conv_used, conv_capacity);
+
+        /* DeltaNet (linear_attention) weights */
+        snprintf(name_buf, sizeof(name_buf),
+                 "model.layers.%d.linear_attn.A_log", l);
+        layer->delta_a_log = load_tensor(data_base,
+                                          find_tensor(tensors, n_tensors, name_buf),
+                                          &conv_buf, &conv_used, conv_capacity);
+
+        snprintf(name_buf, sizeof(name_buf),
+                 "model.layers.%d.linear_attn.conv1d.weight", l);
+        layer->delta_conv1d = load_tensor(data_base,
+                                           find_tensor(tensors, n_tensors, name_buf),
+                                           &conv_buf, &conv_used, conv_capacity);
+
+        snprintf(name_buf, sizeof(name_buf),
+                 "model.layers.%d.linear_attn.dt_bias", l);
+        layer->delta_dt_bias = load_tensor(data_base,
+                                            find_tensor(tensors, n_tensors, name_buf),
+                                            &conv_buf, &conv_used, conv_capacity);
+
+        snprintf(name_buf, sizeof(name_buf),
+                 "model.layers.%d.linear_attn.in_proj_a.weight", l);
+        layer->delta_in_proj_a = load_tensor(data_base,
+                                              find_tensor(tensors, n_tensors, name_buf),
+                                              &conv_buf, &conv_used, conv_capacity);
+
+        snprintf(name_buf, sizeof(name_buf),
+                 "model.layers.%d.linear_attn.in_proj_b.weight", l);
+        layer->delta_in_proj_b = load_tensor(data_base,
+                                              find_tensor(tensors, n_tensors, name_buf),
+                                              &conv_buf, &conv_used, conv_capacity);
+
+        snprintf(name_buf, sizeof(name_buf),
+                 "model.layers.%d.linear_attn.in_proj_qkv.weight", l);
+        layer->delta_in_proj_qkv = load_tensor(data_base,
+                                                find_tensor(tensors, n_tensors, name_buf),
+                                                &conv_buf, &conv_used, conv_capacity);
+
+        snprintf(name_buf, sizeof(name_buf),
+                 "model.layers.%d.linear_attn.in_proj_z.weight", l);
+        layer->delta_in_proj_z = load_tensor(data_base,
+                                              find_tensor(tensors, n_tensors, name_buf),
+                                              &conv_buf, &conv_used, conv_capacity);
+
+        snprintf(name_buf, sizeof(name_buf),
+                 "model.layers.%d.linear_attn.norm.weight", l);
+        layer->delta_norm = load_tensor(data_base,
+                                         find_tensor(tensors, n_tensors, name_buf),
+                                         &conv_buf, &conv_used, conv_capacity);
+
+        snprintf(name_buf, sizeof(name_buf),
+                 "model.layers.%d.linear_attn.out_proj.weight", l);
+        layer->delta_out_proj = load_tensor(data_base,
+                                             find_tensor(tensors, n_tensors, name_buf),
+                                             &conv_buf, &conv_used, conv_capacity);
+
         /* FFN: gate, up, down projections (SwiGLU) */
         snprintf(name_buf, sizeof(name_buf),
                  "model.layers.%d.mlp.gate_proj.weight", l);