Skip to content

Commit 8348162

Browse files
unamedkr and claude
committed
v0.8.2: Tokenizer + DeltaNet + full hybrid forward pass
Tokenizer (tokenizer.json → BPE encode/decode):
- Parses HuggingFace BPE format (248K vocab, 247K merges)
- GPT2/Qwen byte-level BPE encoding with iterative pair merging
- Verified: "Hello" → token 9419 → decode "Hello" ✓

DeltaNet (Gated DeltaNet linear attention):
- 18/24 layers in Qwen3.5-0.8B use DeltaNet (recurrent, not attention)
- Implements: QKV projection, causal conv1d, L2 normalize, delta rule recurrent update, per-head RMSNorm, swish gating, output projection
- All 10 weight tensors per layer loaded from safetensors

Self-Attention (Qwen3.5-specific):
- QK-norm (pre-attention normalization)
- Partial RoPE (25% of head_dim rotated)
- Output gate (sigmoid gating on attention output)

Model loader enhanced:
- Detects DeltaNet config from tensor shapes
- Loads QK-norm, output gate weights
- Sets Qwen3.5 defaults (rope_freq_base=10M, partial_rotary=0.25)

Status: Model loads, tokenizer works, forward pass runs without crash. Output quality needs tuning (DeltaNet weight ordering/normalization). 19/19 tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 6413e4c commit 8348162

5 files changed

Lines changed: 1556 additions & 299 deletions

File tree

include/turboquant/tq_engine.h

Lines changed: 52 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,28 +16,57 @@ typedef struct {
1616
int n_layers;
1717
int hidden_dim;
1818
int intermediate_dim;
19-
int n_heads; /* query heads */
20-
int n_kv_heads; /* KV heads (GQA) */
21-
int head_dim;
19+
int n_heads; /* query heads (for self_attn layers) */
20+
int n_kv_heads; /* KV heads (GQA, for self_attn layers) */
21+
int head_dim; /* head dimension for self_attn */
2222
int vocab_size;
2323
int max_seq_len;
2424
float rope_freq_base;
2525
float rms_norm_eps;
26+
27+
/* DeltaNet (linear_attention) config */
28+
int delta_n_heads; /* number of DeltaNet heads (e.g., 16) */
29+
int delta_key_head_dim; /* key head dim (e.g., 128) */
30+
int delta_value_head_dim;/* value head dim (e.g., 128) */
31+
int delta_conv_width; /* conv1d kernel width (e.g., 4) */
32+
float partial_rotary_factor; /* fraction of head_dim that uses RoPE (e.g., 0.25) */
33+
34+
/* QK-norm for self_attn (Qwen3.5 style) */
35+
int use_qk_norm; /* 1 if q_norm/k_norm weights present */
36+
int attn_output_gate; /* 1 if q_proj includes output gate (doubled q_proj output) */
2637
} tq_model_config_t;
2738

2839
/* ============================================================
2940
* Model weights (in memory)
3041
* ============================================================ */
3142
typedef struct {
32-
float* attn_norm; /* [hidden_dim] */
33-
float* ffn_norm; /* [hidden_dim] */
43+
/* RMSNorm weights */
44+
float* attn_norm; /* [hidden_dim] input_layernorm */
45+
float* ffn_norm; /* [hidden_dim] post_attention_layernorm */
46+
47+
/* Standard self_attn weights (NULL for DeltaNet layers) */
3448
float* wq; /* [n_heads * head_dim, hidden_dim] */
3549
float* wk; /* [n_kv_heads * head_dim, hidden_dim] */
3650
float* wv; /* [n_kv_heads * head_dim, hidden_dim] */
3751
float* wo; /* [hidden_dim, n_heads * head_dim] */
52+
float* q_norm; /* [head_dim] QK-norm for queries */
53+
float* k_norm; /* [head_dim] QK-norm for keys */
54+
55+
/* SwiGLU FFN weights (present on ALL layers) */
3856
float* w_gate; /* [intermediate_dim, hidden_dim] */
3957
float* w_up; /* [intermediate_dim, hidden_dim] */
4058
float* w_down; /* [hidden_dim, intermediate_dim] */
59+
60+
/* DeltaNet (linear_attention) weights (NULL for self_attn layers) */
61+
float* delta_a_log; /* [delta_n_heads] decay parameter (log scale) */
62+
float* delta_conv1d; /* [qkv_dim, 1, conv_width] */
63+
float* delta_dt_bias; /* [delta_n_heads] delta bias */
64+
float* delta_in_proj_a; /* [delta_n_heads, hidden_dim] */
65+
float* delta_in_proj_b; /* [delta_n_heads, hidden_dim] */
66+
float* delta_in_proj_qkv; /* [qkv_dim, hidden_dim] (qkv_dim = 3 * delta_n_heads * delta_key_head_dim) */
67+
float* delta_in_proj_z; /* [z_dim, hidden_dim] (z_dim = delta_n_heads * delta_value_head_dim) */
68+
float* delta_norm; /* [delta_value_head_dim] group norm weight */
69+
float* delta_out_proj; /* [hidden_dim, z_dim] */
4170
} tq_layer_weights_t;
4271

4372
typedef struct {
@@ -80,12 +109,22 @@ typedef struct {
80109
float* hb2; /* [intermediate_dim] FFN buffer 2 */
81110
float* logits; /* [vocab_size] output logits */
82111

83-
/* KV cache — FP32 for values, quantized for keys via TurboQuant */
112+
/* KV cache for self_attn layers */
84113
float* key_cache; /* [n_layers, max_seq_len, n_kv_heads * head_dim] */
85114
float* value_cache; /* [n_layers, max_seq_len, n_kv_heads * head_dim] */
86115
tq_type kv_quant_type; /* quantization type for KV attention */
87116
size_t kv_cache_size;
88117

118+
/* DeltaNet recurrent state */
119+
float* delta_state; /* [n_layers, delta_n_heads, key_head_dim, value_head_dim] */
120+
float* conv_state; /* [n_layers, qkv_dim, conv_width-1] */
121+
122+
/* DeltaNet workspace buffers */
123+
float* delta_qkv; /* [qkv_dim] workspace for QKV projection */
124+
float* delta_z; /* [z_dim] workspace for Z gate */
125+
float* delta_ab; /* [delta_n_heads * 2] workspace for a,b projections */
126+
float* delta_out; /* [z_dim] workspace for output */
127+
89128
/* Quantization workspace */
90129
void* quant_key_buf; /* workspace for quantized keys */
91130
float* quant_score_buf; /* workspace for quantized attention scores */
@@ -109,12 +148,15 @@ typedef struct {
109148
* Tokenizer
110149
* ============================================================ */
111150
typedef struct {
112-
char** vocab; /* token strings */
113-
float* scores; /* BPE merge scores */
114-
int vocab_size;
151+
char** vocab; /* token strings, indexed by token_id */
152+
float* scores; /* BPE merge scores (merge priority) */
153+
int vocab_size; /* total vocab capacity (max_id + 1) */
115154
int max_token_len;
116-
/* Sorted vocab for encoding */
155+
int n_merges; /* number of BPE merges loaded */
156+
/* Sorted vocab for encoding (binary search by string) */
117157
int* sorted_indices;
158+
/* Merge table: pairs of token IDs that merge into a result */
159+
int* merge_pairs; /* [n_merges * 3]: (token_a, token_b, result_id) */
118160
} tq_tokenizer_t;
119161

120162
/* ============================================================

src/engine/tq_generate.c

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -178,12 +178,17 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
178178
int output_pos = 0;
179179
int prev_token = prompt_tokens[n_prompt - 1];
180180

181-
/* EOS token IDs — common values */
182-
int eos_token = 2;
181+
/* EOS token IDs — check common values.
182+
* Qwen3.5: eos = 248044 (<|endoftext|>), also 248046 (<|im_end|>)
183+
* LLaMA: eos = 2 */
184+
int eos_token1 = 2; /* LLaMA convention */
185+
int eos_token2 = 248044; /* Qwen <|endoftext|> */
186+
int eos_token3 = 248046; /* Qwen <|im_end|> */
183187

184188
/* Generate loop */
185189
while (generated < config->max_tokens) {
186-
if (next_token == eos_token) break;
190+
if (next_token == eos_token1 || next_token == eos_token2 ||
191+
next_token == eos_token3) break;
187192
if (pos >= model->config.max_seq_len) break;
188193

189194
/* Decode token to text */

src/engine/tq_model.c

Lines changed: 172 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -699,19 +699,106 @@ tq_model_t* tq_load_model(const char* path) {
699699
if (wq0 && wk0) {
700700
int q_out = (int)wq0->shape[0];
701701
int k_out = (int)wk0->shape[0];
702-
/* Common head_dim values: 64, 128 */
703-
/* Try head_dim = 128, then 64, then 96 */
704-
int head_dim = 128;
705-
if (q_out % head_dim != 0) head_dim = 64;
706-
if (q_out % head_dim != 0) head_dim = 96;
702+
703+
/* Try to detect head_dim from q_norm weight if available */
704+
snprintf(name_buf, sizeof(name_buf),
705+
"model.layers.%d.self_attn.q_norm.weight", probe_layer);
706+
tensor_info_t* qn0 = find_tensor(tensors, n_tensors, name_buf);
707+
int head_dim;
708+
if (qn0 && qn0->n_dims >= 1) {
709+
head_dim = (int)qn0->shape[0];
710+
model->config.use_qk_norm = 1;
711+
} else {
712+
/* Common head_dim values: 128, 64, 96, 256 */
713+
head_dim = 128;
714+
if (q_out % head_dim != 0) head_dim = 64;
715+
if (q_out % head_dim != 0) head_dim = 96;
716+
if (q_out % head_dim != 0) head_dim = 256;
717+
model->config.use_qk_norm = 0;
718+
}
707719
model->config.head_dim = head_dim;
708-
model->config.n_heads = q_out / head_dim;
709720
model->config.n_kv_heads = k_out / head_dim;
721+
722+
/* Detect attn_output_gate: if q_proj output is exactly 2x k_proj
723+
* output * (n_heads/n_kv_heads ratio), then q_proj includes a gate.
724+
* More precisely: q_out = n_heads * head_dim * (1 + gate).
725+
* Compare against o_proj input dim to determine n_heads. */
726+
snprintf(name_buf, sizeof(name_buf),
727+
"model.layers.%d.self_attn.o_proj.weight", probe_layer);
728+
tensor_info_t* wo0 = find_tensor(tensors, n_tensors, name_buf);
729+
if (wo0 && wo0->n_dims >= 2) {
730+
int o_in = (int)wo0->shape[1]; /* o_proj is [hidden_dim, n_heads*head_dim] */
731+
int n_heads_from_o = o_in / head_dim;
732+
if (q_out == n_heads_from_o * head_dim * 2) {
733+
/* q_proj is doubled: [Q, gate_q] */
734+
model->config.attn_output_gate = 1;
735+
model->config.n_heads = n_heads_from_o;
736+
fprintf(stderr, "tq_load_model: detected attn_output_gate=1 "
737+
"(q_proj=%d = 2 * %d * %d)\n",
738+
q_out, n_heads_from_o, head_dim);
739+
} else {
740+
model->config.attn_output_gate = 0;
741+
model->config.n_heads = q_out / head_dim;
742+
}
743+
} else {
744+
model->config.attn_output_gate = 0;
745+
model->config.n_heads = q_out / head_dim;
746+
}
710747
} else {
711748
/* Defaults for small models */
712749
model->config.head_dim = 64;
713750
model->config.n_heads = model->config.hidden_dim / 64;
714751
model->config.n_kv_heads = model->config.n_heads;
752+
model->config.use_qk_norm = 0;
753+
model->config.attn_output_gate = 0;
754+
}
755+
756+
/* Detect DeltaNet config from first linear_attn layer */
757+
model->config.delta_n_heads = 0;
758+
model->config.delta_key_head_dim = 0;
759+
model->config.delta_value_head_dim = 0;
760+
model->config.delta_conv_width = 4;
761+
model->config.partial_rotary_factor = 0.0f;
762+
{
763+
/* Find first DeltaNet layer */
764+
int delta_layer = -1;
765+
for (int l = 0; l < model->config.n_layers; l++) {
766+
snprintf(name_buf, sizeof(name_buf),
767+
"model.layers.%d.linear_attn.A_log", l);
768+
if (find_tensor(tensors, n_tensors, name_buf)) {
769+
delta_layer = l;
770+
break;
771+
}
772+
}
773+
if (delta_layer >= 0) {
774+
snprintf(name_buf, sizeof(name_buf),
775+
"model.layers.%d.linear_attn.A_log", delta_layer);
776+
tensor_info_t* a_log = find_tensor(tensors, n_tensors, name_buf);
777+
if (a_log) {
778+
model->config.delta_n_heads = (int)a_log->shape[0];
779+
}
780+
781+
snprintf(name_buf, sizeof(name_buf),
782+
"model.layers.%d.linear_attn.in_proj_qkv.weight", delta_layer);
783+
tensor_info_t* qkv_proj = find_tensor(tensors, n_tensors, name_buf);
784+
if (qkv_proj && model->config.delta_n_heads > 0) {
785+
int qkv_dim = (int)qkv_proj->shape[0];
786+
/* qkv_dim = 3 * n_heads * head_dim */
787+
model->config.delta_key_head_dim = qkv_dim / (3 * model->config.delta_n_heads);
788+
model->config.delta_value_head_dim = model->config.delta_key_head_dim;
789+
}
790+
791+
snprintf(name_buf, sizeof(name_buf),
792+
"model.layers.%d.linear_attn.conv1d.weight", delta_layer);
793+
tensor_info_t* conv = find_tensor(tensors, n_tensors, name_buf);
794+
if (conv && conv->n_dims >= 3) {
795+
model->config.delta_conv_width = (int)conv->shape[2];
796+
}
797+
798+
fprintf(stderr, "tq_load_model: DeltaNet config — %d heads, key_dim=%d, val_dim=%d, conv_w=%d\n",
799+
model->config.delta_n_heads, model->config.delta_key_head_dim,
800+
model->config.delta_value_head_dim, model->config.delta_conv_width);
801+
}
715802
}
716803

717804
/* Detect intermediate_dim from gate projection (use probe_layer) */
@@ -730,10 +817,18 @@ tq_model_t* tq_load_model(const char* path) {
730817
model->config.intermediate_dim = model->config.hidden_dim * 4;
731818
}
732819

733-
/* Defaults */
820+
/* Defaults — tuned for Qwen3.5 if DeltaNet detected */
734821
model->config.max_seq_len = 4096;
735-
model->config.rope_freq_base = 10000.0f;
736-
model->config.rms_norm_eps = 1e-5f;
822+
if (model->config.delta_n_heads > 0) {
823+
/* Qwen3.5 uses rope_theta=10M, rms_norm_eps=1e-6, partial_rotary=0.25 */
824+
model->config.rope_freq_base = 10000000.0f;
825+
model->config.rms_norm_eps = 1e-6f;
826+
model->config.partial_rotary_factor = 0.25f;
827+
} else {
828+
model->config.rope_freq_base = 10000.0f;
829+
model->config.rms_norm_eps = 1e-5f;
830+
model->config.partial_rotary_factor = 0.0f;
831+
}
737832

738833
/* Allocate layer weight pointers */
739834
int n_layers = model->config.n_layers;
@@ -791,6 +886,74 @@ tq_model_t* tq_load_model(const char* path) {
791886
find_tensor(tensors, n_tensors, name_buf),
792887
&conv_buf, &conv_used, conv_capacity);
793888

889+
/* QK-norm weights (Qwen3.5 style) */
890+
snprintf(name_buf, sizeof(name_buf),
891+
"model.layers.%d.self_attn.q_norm.weight", l);
892+
layer->q_norm = load_tensor(data_base,
893+
find_tensor(tensors, n_tensors, name_buf),
894+
&conv_buf, &conv_used, conv_capacity);
895+
896+
snprintf(name_buf, sizeof(name_buf),
897+
"model.layers.%d.self_attn.k_norm.weight", l);
898+
layer->k_norm = load_tensor(data_base,
899+
find_tensor(tensors, n_tensors, name_buf),
900+
&conv_buf, &conv_used, conv_capacity);
901+
902+
/* DeltaNet (linear_attention) weights */
903+
snprintf(name_buf, sizeof(name_buf),
904+
"model.layers.%d.linear_attn.A_log", l);
905+
layer->delta_a_log = load_tensor(data_base,
906+
find_tensor(tensors, n_tensors, name_buf),
907+
&conv_buf, &conv_used, conv_capacity);
908+
909+
snprintf(name_buf, sizeof(name_buf),
910+
"model.layers.%d.linear_attn.conv1d.weight", l);
911+
layer->delta_conv1d = load_tensor(data_base,
912+
find_tensor(tensors, n_tensors, name_buf),
913+
&conv_buf, &conv_used, conv_capacity);
914+
915+
snprintf(name_buf, sizeof(name_buf),
916+
"model.layers.%d.linear_attn.dt_bias", l);
917+
layer->delta_dt_bias = load_tensor(data_base,
918+
find_tensor(tensors, n_tensors, name_buf),
919+
&conv_buf, &conv_used, conv_capacity);
920+
921+
snprintf(name_buf, sizeof(name_buf),
922+
"model.layers.%d.linear_attn.in_proj_a.weight", l);
923+
layer->delta_in_proj_a = load_tensor(data_base,
924+
find_tensor(tensors, n_tensors, name_buf),
925+
&conv_buf, &conv_used, conv_capacity);
926+
927+
snprintf(name_buf, sizeof(name_buf),
928+
"model.layers.%d.linear_attn.in_proj_b.weight", l);
929+
layer->delta_in_proj_b = load_tensor(data_base,
930+
find_tensor(tensors, n_tensors, name_buf),
931+
&conv_buf, &conv_used, conv_capacity);
932+
933+
snprintf(name_buf, sizeof(name_buf),
934+
"model.layers.%d.linear_attn.in_proj_qkv.weight", l);
935+
layer->delta_in_proj_qkv = load_tensor(data_base,
936+
find_tensor(tensors, n_tensors, name_buf),
937+
&conv_buf, &conv_used, conv_capacity);
938+
939+
snprintf(name_buf, sizeof(name_buf),
940+
"model.layers.%d.linear_attn.in_proj_z.weight", l);
941+
layer->delta_in_proj_z = load_tensor(data_base,
942+
find_tensor(tensors, n_tensors, name_buf),
943+
&conv_buf, &conv_used, conv_capacity);
944+
945+
snprintf(name_buf, sizeof(name_buf),
946+
"model.layers.%d.linear_attn.norm.weight", l);
947+
layer->delta_norm = load_tensor(data_base,
948+
find_tensor(tensors, n_tensors, name_buf),
949+
&conv_buf, &conv_used, conv_capacity);
950+
951+
snprintf(name_buf, sizeof(name_buf),
952+
"model.layers.%d.linear_attn.out_proj.weight", l);
953+
layer->delta_out_proj = load_tensor(data_base,
954+
find_tensor(tensors, n_tensors, name_buf),
955+
&conv_buf, &conv_used, conv_capacity);
956+
794957
/* FFN: gate, up, down projections (SwiGLU) */
795958
snprintf(name_buf, sizeof(name_buf),
796959
"model.layers.%d.mlp.gate_proj.weight", l);

0 commit comments

Comments
 (0)