quantumaikr
diff --git a/.gitignore b/.gitignore
Lines changed: 2 additions & 1 deletion
diff --git a/include/turboquant/tq_engine.h b/include/turboquant/tq_engine.h
Lines changed: 25 additions & 2 deletions
diff --git a/scripts/release_notes_v0.1.0.md b/scripts/release_notes_v0.1.0.md
Lines changed: 37 additions & 0 deletions
diff --git a/src/engine/tq_model.c b/src/engine/tq_model.c
Lines changed: 158 additions & 2 deletions
diff --git a/src/engine/tq_ops.c b/src/engine/tq_ops.c
Lines changed: 13 additions & 0 deletions
@@ -39,4 +39,5 @@ spec/test_vectors/*.bin
 refs/.venv/
 .cache/
 *.tqm
-.venv/
+.venv/
+refs/
@@ -34,6 +34,13 @@ typedef struct {
     /* QK-norm for self_attn (Qwen3.5 style) */
     int use_qk_norm;         /* 1 if q_norm/k_norm weights present */
     int attn_output_gate;    /* 1 if q_proj includes output gate (doubled q_proj output) */
+
+    /* Multi-architecture support */
+    int model_type;          /* 0=qwen35, 1=gemma3 */
+    int sliding_window;      /* sliding window size (512 for gemma3, 0 for unlimited) */
+    float rope_local_base_freq; /* RoPE base freq for local/sliding layers (10000.0 for gemma3) */
+    int n_norms_per_block;   /* 2 for qwen35, 4 for gemma3 */
+    float query_pre_attn_scalar; /* attention scaling: 1/sqrt(this) instead of 1/sqrt(head_dim), 0=use head_dim */
 } tq_model_config_t;
 
 /* ============================================================
@@ -52,6 +59,11 @@ typedef struct {
     float* q_norm;        /* [head_dim] QK-norm for queries */
     float* k_norm;        /* [head_dim] QK-norm for keys */
 
+    /* Gemma3 extra norms (NULL for Qwen3.5) */
+    float* post_attn_norm;   /* [hidden_dim] post_attention_layernorm (Gemma3 only) */
+    float* pre_ffn_norm;     /* [hidden_dim] pre_feedforward_layernorm (Gemma3 only) */
+    float* post_ffn_norm;    /* [hidden_dim] post_feedforward_layernorm (Gemma3 only) */
+
     /* SwiGLU FFN weights (present on ALL layers) */
     float* w_gate;        /* [intermediate_dim, hidden_dim] */
     float* w_up;          /* [intermediate_dim, hidden_dim] */
@@ -128,6 +140,9 @@ typedef struct {
     int n_attn_layers;        /* number of layers with standard self_attn */
     int* attn_layer_indices;  /* which layer indices have self_attn [n_attn_layers] */
 
+    /* Gemma3 sliding window support */
+    int* layer_is_sliding;    /* [n_layers] per-layer flag: 1=sliding, 0=global (NULL if not used) */
+
     /* Q4 output weight (lm_head) — runtime quantized for fast logit projection */
     uint8_t* output_qs;       /* [vocab_size * n_blocks * 16] Q4 packed nibbles */
     float* output_scales;     /* [vocab_size * n_blocks] Q4 block scales */
@@ -278,9 +293,16 @@ typedef struct {
     int32_t n_attn_layers;
     int32_t attn_layer_indices[64]; /* which layers are self_attn (max 64) */
 
+    /* Multi-architecture support (Gemma3) */
+    int32_t model_type;       /* 0=qwen35, 1=gemma3 */
+    int32_t sliding_window;   /* sliding window size (512 for gemma3, 0=unlimited) */
+    float   rope_local_base_freq; /* RoPE base for local/sliding layers */
+    int32_t n_norms_per_block;/* 2 for qwen35, 4 for gemma3 */
+    float   query_pre_attn_scalar; /* attention scaling (0=use head_dim) */
+
     /* Padding to 512 bytes.
-     * With pack(1): 8+32+8+16+12+8+32+260 = 376 used, 136 pad */
-    uint8_t _pad[136];
+     * With pack(1): 376 + 20 = 396 used, 116 pad */
+    uint8_t _pad[116];
 } tqm_header_t;
 #pragma pack(pop)
 
@@ -338,6 +360,7 @@ void tq_rmsnorm(float* out, const float* x, const float* weight, int n, float ep
 void tq_rope(float* q, float* k, int pos, int head_dim,
              int n_heads, int n_kv_heads, float freq_base);
 void tq_silu(float* x, int n);
+void tq_gelu_tanh(float* x, int n);
 void tq_softmax(float* x, int n);
 void tq_add(float* out, const float* a, const float* b, int n);
 void tq_mul(float* out, const float* a, const float* b, int n);
 
@@ -0,0 +1,37 @@
+## TurboQuant.cpp v0.1.0 — First Release
+
+Pure C LLM inference engine with KV cache compression. Matches llama.cpp single-thread speed.
+
+### Highlights
+
+- **82 tok/s peak** on Qwen3.5-0.8B (Q4, CPU-only, Apple Silicon)
+- **51 tok/s single-thread** — on par with llama.cpp (50.7 tok/s)
+- **7.5x KV cache compression** with 0.999 cosine similarity
+- **8 quantization types** spanning Uniform, Mixed, PolarQuant, QJL, and TurboQuant
+- **TQM format**: pre-quantized binary model, mmap instant load (0.3s)
+- **Zero dependencies**: libc only, ~1MB binary
+- **One-command quickstart**: `bash scripts/quickstart.sh`
+
+### What's Included
+
+- Complete inference engine: DeltaNet + Self-Attention hybrid (Qwen3.5)
+- BPE tokenizer (248K vocab, embedded in TQM)
+- Q4 weight quantization with NEON 2-row batching
+- Thread pool with zero-overhead dispatch
+- Integer Q4×Q8 attention (ARM vdotq_s32)
+- 19 test suites, 135 tests
+- Python bindings (ctypes)
+- llama.cpp / vLLM integration stubs
+
+### Quick Start
+
+```bash
+git clone https://github.com/quantumaikr/TurboQuant.cpp && cd TurboQuant.cpp
+bash scripts/quickstart.sh "What is deep learning?"
+```
+
+### References
+
+- [TurboQuant](https://arxiv.org/abs/2504.19874) (ICLR 2026)
+- [QJL](https://arxiv.org/abs/2406.03482) (AAAI 2025)
+- [PolarQuant](https://arxiv.org/abs/2502.02617) (AISTATS 2026)
@@ -880,17 +880,42 @@ static tq_model_t* tq_load_safetensors(const char* path) {
         model->config.intermediate_dim = model->config.hidden_dim * 4;
     }
 
+    /* Detect Gemma3 architecture by presence of pre_feedforward_layernorm */
+    {
+        snprintf(name_buf, sizeof(name_buf),
+                 "model.layers.0.pre_feedforward_layernorm.weight");
+        tensor_info_t* gemma3_probe = find_tensor(tensors, n_tensors, name_buf);
+        if (gemma3_probe) {
+            model->config.model_type = 1; /* gemma3 */
+            model->config.n_norms_per_block = 4;
+            fprintf(stderr, "tq_load_model: detected Gemma3 architecture (4 norms per block)\n");
+        } else {
+            model->config.model_type = 0; /* qwen35 */
+            model->config.n_norms_per_block = 2;
+        }
+    }
+
     /* Defaults — tuned for Qwen3.5 if DeltaNet detected */
     model->config.max_seq_len = 4096;
-    if (model->config.delta_n_heads > 0) {
+    if (model->config.model_type == 1) {
+        /* Gemma3: rope_theta=1M for global, 10K for local, rms_norm_eps=1e-6 */
+        model->config.rope_freq_base = 1000000.0f; /* global layers */
+        model->config.rope_local_base_freq = 10000.0f; /* sliding/local layers */
+        model->config.rms_norm_eps = 1e-6f;
+        model->config.partial_rotary_factor = 0.0f;
+        model->config.sliding_window = 512;
+        model->config.query_pre_attn_scalar = 256.0f;
+    } else if (model->config.delta_n_heads > 0) {
         /* Qwen3.5 uses rope_theta=10M, rms_norm_eps=1e-6, partial_rotary=0.25 */
         model->config.rope_freq_base = 10000000.0f;
         model->config.rms_norm_eps = 1e-6f;
         model->config.partial_rotary_factor = 0.25f;
+        model->config.query_pre_attn_scalar = 0.0f;
     } else {
         model->config.rope_freq_base = 10000.0f;
         model->config.rms_norm_eps = 1e-5f;
         model->config.partial_rotary_factor = 0.0f;
+        model->config.query_pre_attn_scalar = 0.0f;
     }
 
     /* Allocate layer weight pointers */
@@ -917,13 +942,32 @@ static tq_model_t* tq_load_safetensors(const char* path) {
                                         find_tensor(tensors, n_tensors, name_buf),
                                         &conv_buf, &conv_used, conv_capacity);
 
-        /* FFN norm */
+        /* FFN norm (Qwen3.5: post_attention_layernorm used as pre-FFN norm) */
         snprintf(name_buf, sizeof(name_buf),
                  "model.layers.%d.post_attention_layernorm.weight", l);
         layer->ffn_norm = load_tensor(data_base,
                                        find_tensor(tensors, n_tensors, name_buf),
                                        &conv_buf, &conv_used, conv_capacity);
 
+        /* Gemma3 extra norms: post_attn, pre_ffn, post_ffn */
+        if (model->config.model_type == 1) {
+            /* For Gemma3, post_attention_layernorm is applied to attn output,
+             * not as pre-FFN norm. Store it in post_attn_norm. */
+            layer->post_attn_norm = layer->ffn_norm;
+
+            snprintf(name_buf, sizeof(name_buf),
+                     "model.layers.%d.pre_feedforward_layernorm.weight", l);
+            layer->pre_ffn_norm = load_tensor(data_base,
+                                               find_tensor(tensors, n_tensors, name_buf),
+                                               &conv_buf, &conv_used, conv_capacity);
+
+            snprintf(name_buf, sizeof(name_buf),
+                     "model.layers.%d.post_feedforward_layernorm.weight", l);
+            layer->post_ffn_norm = load_tensor(data_base,
+                                                find_tensor(tensors, n_tensors, name_buf),
+                                                &conv_buf, &conv_used, conv_capacity);
+        }
+
         /* Q, K, V, O projections — only exist for self_attn layers */
         snprintf(name_buf, sizeof(name_buf),
                  "model.layers.%d.self_attn.q_proj.weight", l);
@@ -1107,6 +1151,77 @@ static tq_model_t* tq_load_safetensors(const char* path) {
         fprintf(stderr, "tq_load_model: applied Qwen3.5 RMSNorm +1 weight adjustment\n");
     }
 
+    /* Gemma3 RMSNorm adjustment: same (1+w) scaling as Qwen3.5 */
+    if (model->config.model_type == 1) {
+        int dim_h = model->config.hidden_dim;
+        int head_dim_h = model->config.head_dim;
+
+        for (int l = 0; l < n_layers; l++) {
+            tq_layer_weights_t* layer_w = &model->layers[l];
+            if (layer_w->attn_norm) {
+                for (int i = 0; i < dim_h; i++) {
+                    layer_w->attn_norm[i] += 1.0f;
+                }
+            }
+            if (layer_w->post_attn_norm) {
+                for (int i = 0; i < dim_h; i++) {
+                    layer_w->post_attn_norm[i] += 1.0f;
+                }
+            }
+            if (layer_w->pre_ffn_norm) {
+                for (int i = 0; i < dim_h; i++) {
+                    layer_w->pre_ffn_norm[i] += 1.0f;
+                }
+            }
+            if (layer_w->post_ffn_norm) {
+                for (int i = 0; i < dim_h; i++) {
+                    layer_w->post_ffn_norm[i] += 1.0f;
+                }
+            }
+            if (layer_w->q_norm) {
+                for (int i = 0; i < head_dim_h; i++) {
+                    layer_w->q_norm[i] += 1.0f;
+                }
+            }
+            if (layer_w->k_norm) {
+                for (int i = 0; i < head_dim_h; i++) {
+                    layer_w->k_norm[i] += 1.0f;
+                }
+            }
+        }
+        if (model->output_norm) {
+            for (int i = 0; i < dim_h; i++) {
+                model->output_norm[i] += 1.0f;
+            }
+        }
+        fprintf(stderr, "tq_load_model: applied Gemma3 RMSNorm +1 weight adjustment\n");
+
+        /* Set up layer_is_sliding for Gemma3.
+         * Pattern: 5 sliding + 1 full, repeated. Layers 0-4=sliding, 5=full, etc.
+         * Derived from the layer index: (l + 1) % 6 == 0 marks a global layer. */
+        model->layer_is_sliding = (int*)calloc((size_t)n_layers, sizeof(int));
+        if (model->layer_is_sliding) {
+            for (int l = 0; l < n_layers; l++) {
+                /* Full/global attention every 6th layer (indices 5, 11, 17, ...) */
+                if ((l + 1) % 6 == 0) {
+                    model->layer_is_sliding[l] = 0; /* global */
+                } else {
+                    model->layer_is_sliding[l] = 1; /* sliding */
+                }
+            }
+            int n_sliding = 0, n_global = 0;
+            for (int l = 0; l < n_layers; l++) {
+                if (model->layer_is_sliding[l]) {
+                    n_sliding++;
+                } else {
+                    n_global++;
+                }
+            }
+            fprintf(stderr, "tq_load_model: Gemma3 layer types: %d sliding, %d global\n",
+                    n_sliding, n_global);
+        }
+    }
+
     fprintf(stderr, "tq_load_model: loaded %d layers (%d with self_attn), "
             "dim=%d, heads=%d/%d, vocab=%d\n",
             model->config.n_layers, model->n_attn_layers,
@@ -1679,6 +1794,13 @@ tq_model_t* tq_load_tqm(const char* path) {
     c->use_qk_norm         = hdr->use_qk_norm;
     c->attn_output_gate    = hdr->attn_output_gate;
 
+    /* Multi-architecture fields */
+    c->model_type              = hdr->model_type;
+    c->sliding_window          = hdr->sliding_window;
+    c->rope_local_base_freq    = hdr->rope_local_base_freq;
+    c->n_norms_per_block       = hdr->n_norms_per_block;
+    c->query_pre_attn_scalar   = hdr->query_pre_attn_scalar;
+
     /* Attn layer indices */
     model->n_attn_layers = hdr->n_attn_layers;
     if (hdr->n_attn_layers > 0) {
@@ -1748,6 +1870,13 @@ tq_model_t* tq_load_tqm(const char* path) {
         TQM_READ_FP32(layer->attn_norm, dim);
         TQM_READ_FP32(layer->ffn_norm, dim);
 
+        /* Gemma3 extra norms */
+        if (c->model_type == 1) {
+            layer->post_attn_norm = layer->ffn_norm; /* shares storage */
+            TQM_READ_FP32(layer->pre_ffn_norm, dim);
+            TQM_READ_FP32(layer->post_ffn_norm, dim);
+        }
+
         if (is_attn_layer && is_attn_layer[l]) {
             /* Self-attention layer */
             TQM_READ_Q4(layer->wq_q4, layer->wq_q4s, qg_dim, dim);
@@ -1814,6 +1943,20 @@ tq_model_t* tq_load_tqm(const char* path) {
     model->use_q4_weights = 1;
     free(is_attn_layer);
 
+    /* Set up Gemma3 layer_is_sliding for TQM models (recomputed from the 5 sliding + 1 global index pattern) */
+    if (c->model_type == 1 && c->sliding_window > 0) {
+        model->layer_is_sliding = (int*)calloc((size_t)c->n_layers, sizeof(int));
+        if (model->layer_is_sliding) {
+            for (int l = 0; l < c->n_layers; l++) {
+                if ((l + 1) % 6 == 0) {
+                    model->layer_is_sliding[l] = 0; /* global */
+                } else {
+                    model->layer_is_sliding[l] = 1; /* sliding */
+                }
+            }
+        }
+    }
+
     /* Runtime Q4 quantization of lm_head (output projection) for fast logit computation.
      * BF16 matmul on 248K x 1024 is slow; Q4 matmul is ~4x faster. */
     if (model->output_weight_bf16) {
@@ -1982,6 +2125,12 @@ int tq_save_tqm(tq_model_t* model, const char* tokenizer_path,
     hdr.use_qk_norm         = c->use_qk_norm;
     hdr.attn_output_gate    = c->attn_output_gate;
 
+    hdr.model_type              = c->model_type;
+    hdr.sliding_window          = c->sliding_window;
+    hdr.rope_local_base_freq    = c->rope_local_base_freq;
+    hdr.n_norms_per_block       = c->n_norms_per_block;
+    hdr.query_pre_attn_scalar   = c->query_pre_attn_scalar;
+
     hdr.weight_quant = 4; /* Q4 */
     hdr.embed_format = 16; /* BF16 */
 
@@ -2041,6 +2190,12 @@ int tq_save_tqm(tq_model_t* model, const char* tokenizer_path,
         TQM_WRITE_FP32(layer->attn_norm, dim);
         TQM_WRITE_FP32(layer->ffn_norm, dim);
 
+        /* Gemma3 extra norms */
+        if (c->model_type == 1) {
+            TQM_WRITE_FP32(layer->pre_ffn_norm, dim);
+            TQM_WRITE_FP32(layer->post_ffn_norm, dim);
+        }
+
         if (is_attn_layer[l]) {
             TQM_WRITE_Q4(layer->wq_q4, layer->wq_q4s, qg_dim, dim);
             TQM_WRITE_Q4(layer->wk_q4, layer->wk_q4s, kv_dim, dim);
@@ -2144,6 +2299,7 @@ void tq_free_model(tq_model_t* model) {
     free(model->_q8_data);
     free(model->_q4_data);
     free(model->attn_layer_indices);
+    free(model->layer_is_sliding);
     free(model->layers);
     free(model);
 }
@@ -1060,6 +1060,19 @@ void tq_silu(float* x, int n) {
 #endif
 }
 
+/* ============================================================
+ * GELU with tanh approximation (Gemma3 GeGLU activation), in place on x[0..n-1]
+ * gelu_tanh(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+ * ============================================================ */
+void tq_gelu_tanh(float* x, int n) {
+    for (int i = 0; i < n; i++) { /* loop body is skipped entirely when n <= 0 */
+        float xi = x[i];
+        float x3 = xi * xi * xi; /* cubic term x^3 from the formula above */
+        float inner = 0.7978845608f * (xi + 0.044715f * x3); /* 0.7978845608 ~= sqrt(2/pi) */
+        x[i] = 0.5f * xi * (1.0f + tanhf(inner)); /* overwrite the input element */
+    }
+}
+
 /* ============================================================
  * Softmax: numerically stable with max subtraction
  * ============================================================ */
-Original file line number
+Diff line change
 refs/.venv/
 .cache/
 *.tqm
 -.venv/
 +.venv/
 +refs/