@@ -699,19 +699,106 @@ tq_model_t* tq_load_model(const char* path) {
699699 if (wq0 && wk0 ) {
700700 int q_out = (int )wq0 -> shape [0 ];
701701 int k_out = (int )wk0 -> shape [0 ];
702- /* Common head_dim values: 64, 128 */
703- /* Try head_dim = 128, then 64, then 96 */
704- int head_dim = 128 ;
705- if (q_out % head_dim != 0 ) head_dim = 64 ;
706- if (q_out % head_dim != 0 ) head_dim = 96 ;
702+
703+ /* Try to detect head_dim from q_norm weight if available */
704+ snprintf (name_buf , sizeof (name_buf ),
705+ "model.layers.%d.self_attn.q_norm.weight" , probe_layer );
706+ tensor_info_t * qn0 = find_tensor (tensors , n_tensors , name_buf );
707+ int head_dim ;
708+ if (qn0 && qn0 -> n_dims >= 1 ) {
709+ head_dim = (int )qn0 -> shape [0 ];
710+ model -> config .use_qk_norm = 1 ;
711+ } else {
712+ /* Common head_dim values: 128, 64, 96, 256 */
713+ head_dim = 128 ;
714+ if (q_out % head_dim != 0 ) head_dim = 64 ;
715+ if (q_out % head_dim != 0 ) head_dim = 96 ;
716+ if (q_out % head_dim != 0 ) head_dim = 256 ;
717+ model -> config .use_qk_norm = 0 ;
718+ }
707719 model -> config .head_dim = head_dim ;
708- model -> config .n_heads = q_out / head_dim ;
709720 model -> config .n_kv_heads = k_out / head_dim ;
721+
722+ /* Detect attn_output_gate: o_proj's input dim (shape[1]) divided by
723+ * head_dim gives the candidate n_heads. If q_proj's output dim is
724+ * exactly 2 * n_heads * head_dim, q_proj is doubled ([Q, gate]) and
725+ * n_heads comes from o_proj; otherwise n_heads = q_out / head_dim. */
726+ snprintf (name_buf , sizeof (name_buf ),
727+ "model.layers.%d.self_attn.o_proj.weight" , probe_layer );
728+ tensor_info_t * wo0 = find_tensor (tensors , n_tensors , name_buf );
729+ if (wo0 && wo0 -> n_dims >= 2 ) {
730+ int o_in = (int )wo0 -> shape [1 ]; /* o_proj is [hidden_dim, n_heads*head_dim] */
731+ int n_heads_from_o = o_in / head_dim ;
732+ if (q_out == n_heads_from_o * head_dim * 2 ) {
733+ /* q_proj is doubled: [Q, gate_q] */
734+ model -> config .attn_output_gate = 1 ;
735+ model -> config .n_heads = n_heads_from_o ;
736+ fprintf (stderr , "tq_load_model: detected attn_output_gate=1 "
737+ "(q_proj=%d = 2 * %d * %d)\n" ,
738+ q_out , n_heads_from_o , head_dim );
739+ } else {
740+ model -> config .attn_output_gate = 0 ;
741+ model -> config .n_heads = q_out / head_dim ;
742+ }
743+ } else {
744+ model -> config .attn_output_gate = 0 ;
745+ model -> config .n_heads = q_out / head_dim ;
746+ }
710747 } else {
711748 /* Defaults for small models */
712749 model -> config .head_dim = 64 ;
713750 model -> config .n_heads = model -> config .hidden_dim / 64 ;
714751 model -> config .n_kv_heads = model -> config .n_heads ;
752+ model -> config .use_qk_norm = 0 ;
753+ model -> config .attn_output_gate = 0 ;
754+ }
755+
756+ /* Detect DeltaNet config from first linear_attn layer */
757+ model -> config .delta_n_heads = 0 ;
758+ model -> config .delta_key_head_dim = 0 ;
759+ model -> config .delta_value_head_dim = 0 ;
760+ model -> config .delta_conv_width = 4 ;
761+ model -> config .partial_rotary_factor = 0.0f ;
762+ {
763+ /* Find first DeltaNet layer */
764+ int delta_layer = -1 ;
765+ for (int l = 0 ; l < model -> config .n_layers ; l ++ ) {
766+ snprintf (name_buf , sizeof (name_buf ),
767+ "model.layers.%d.linear_attn.A_log" , l );
768+ if (find_tensor (tensors , n_tensors , name_buf )) {
769+ delta_layer = l ;
770+ break ;
771+ }
772+ }
773+ if (delta_layer >= 0 ) {
774+ snprintf (name_buf , sizeof (name_buf ),
775+ "model.layers.%d.linear_attn.A_log" , delta_layer );
776+ tensor_info_t * a_log = find_tensor (tensors , n_tensors , name_buf );
777+ if (a_log ) {
778+ model -> config .delta_n_heads = (int )a_log -> shape [0 ];
779+ }
780+
781+ snprintf (name_buf , sizeof (name_buf ),
782+ "model.layers.%d.linear_attn.in_proj_qkv.weight" , delta_layer );
783+ tensor_info_t * qkv_proj = find_tensor (tensors , n_tensors , name_buf );
784+ if (qkv_proj && model -> config .delta_n_heads > 0 ) {
785+ int qkv_dim = (int )qkv_proj -> shape [0 ];
786+ /* qkv_dim = 3 * n_heads * head_dim */
787+ model -> config .delta_key_head_dim = qkv_dim / (3 * model -> config .delta_n_heads );
788+ model -> config .delta_value_head_dim = model -> config .delta_key_head_dim ;
789+ }
790+
791+ snprintf (name_buf , sizeof (name_buf ),
792+ "model.layers.%d.linear_attn.conv1d.weight" , delta_layer );
793+ tensor_info_t * conv = find_tensor (tensors , n_tensors , name_buf );
794+ if (conv && conv -> n_dims >= 3 ) {
795+ model -> config .delta_conv_width = (int )conv -> shape [2 ];
796+ }
797+
798+ fprintf (stderr , "tq_load_model: DeltaNet config — %d heads, key_dim=%d, val_dim=%d, conv_w=%d\n" ,
799+ model -> config .delta_n_heads , model -> config .delta_key_head_dim ,
800+ model -> config .delta_value_head_dim , model -> config .delta_conv_width );
801+ }
715802 }
716803
717804 /* Detect intermediate_dim from gate projection (use probe_layer) */
@@ -730,10 +817,18 @@ tq_model_t* tq_load_model(const char* path) {
730817 model -> config .intermediate_dim = model -> config .hidden_dim * 4 ;
731818 }
732819
733- /* Defaults */
820+ /* Defaults — tuned for Qwen3.5 if DeltaNet detected */
734821 model -> config .max_seq_len = 4096 ;
735- model -> config .rope_freq_base = 10000.0f ;
736- model -> config .rms_norm_eps = 1e-5f ;
822+ if (model -> config .delta_n_heads > 0 ) {
823+ /* Qwen3.5 uses rope_theta=10M, rms_norm_eps=1e-6, partial_rotary=0.25 */
824+ model -> config .rope_freq_base = 10000000.0f ;
825+ model -> config .rms_norm_eps = 1e-6f ;
826+ model -> config .partial_rotary_factor = 0.25f ;
827+ } else {
828+ model -> config .rope_freq_base = 10000.0f ;
829+ model -> config .rms_norm_eps = 1e-5f ;
830+ model -> config .partial_rotary_factor = 0.0f ;
831+ }
737832
738833 /* Allocate layer weight pointers */
739834 int n_layers = model -> config .n_layers ;
@@ -791,6 +886,74 @@ tq_model_t* tq_load_model(const char* path) {
791886 find_tensor (tensors , n_tensors , name_buf ),
792887 & conv_buf , & conv_used , conv_capacity );
793888
889+ /* QK-norm weights (Qwen3.5 style) */
890+ snprintf (name_buf , sizeof (name_buf ),
891+ "model.layers.%d.self_attn.q_norm.weight" , l );
892+ layer -> q_norm = load_tensor (data_base ,
893+ find_tensor (tensors , n_tensors , name_buf ),
894+ & conv_buf , & conv_used , conv_capacity );
895+
896+ snprintf (name_buf , sizeof (name_buf ),
897+ "model.layers.%d.self_attn.k_norm.weight" , l );
898+ layer -> k_norm = load_tensor (data_base ,
899+ find_tensor (tensors , n_tensors , name_buf ),
900+ & conv_buf , & conv_used , conv_capacity );
901+
902+ /* DeltaNet (linear_attention) weights */
903+ snprintf (name_buf , sizeof (name_buf ),
904+ "model.layers.%d.linear_attn.A_log" , l );
905+ layer -> delta_a_log = load_tensor (data_base ,
906+ find_tensor (tensors , n_tensors , name_buf ),
907+ & conv_buf , & conv_used , conv_capacity );
908+
909+ snprintf (name_buf , sizeof (name_buf ),
910+ "model.layers.%d.linear_attn.conv1d.weight" , l );
911+ layer -> delta_conv1d = load_tensor (data_base ,
912+ find_tensor (tensors , n_tensors , name_buf ),
913+ & conv_buf , & conv_used , conv_capacity );
914+
915+ snprintf (name_buf , sizeof (name_buf ),
916+ "model.layers.%d.linear_attn.dt_bias" , l );
917+ layer -> delta_dt_bias = load_tensor (data_base ,
918+ find_tensor (tensors , n_tensors , name_buf ),
919+ & conv_buf , & conv_used , conv_capacity );
920+
921+ snprintf (name_buf , sizeof (name_buf ),
922+ "model.layers.%d.linear_attn.in_proj_a.weight" , l );
923+ layer -> delta_in_proj_a = load_tensor (data_base ,
924+ find_tensor (tensors , n_tensors , name_buf ),
925+ & conv_buf , & conv_used , conv_capacity );
926+
927+ snprintf (name_buf , sizeof (name_buf ),
928+ "model.layers.%d.linear_attn.in_proj_b.weight" , l );
929+ layer -> delta_in_proj_b = load_tensor (data_base ,
930+ find_tensor (tensors , n_tensors , name_buf ),
931+ & conv_buf , & conv_used , conv_capacity );
932+
933+ snprintf (name_buf , sizeof (name_buf ),
934+ "model.layers.%d.linear_attn.in_proj_qkv.weight" , l );
935+ layer -> delta_in_proj_qkv = load_tensor (data_base ,
936+ find_tensor (tensors , n_tensors , name_buf ),
937+ & conv_buf , & conv_used , conv_capacity );
938+
939+ snprintf (name_buf , sizeof (name_buf ),
940+ "model.layers.%d.linear_attn.in_proj_z.weight" , l );
941+ layer -> delta_in_proj_z = load_tensor (data_base ,
942+ find_tensor (tensors , n_tensors , name_buf ),
943+ & conv_buf , & conv_used , conv_capacity );
944+
945+ snprintf (name_buf , sizeof (name_buf ),
946+ "model.layers.%d.linear_attn.norm.weight" , l );
947+ layer -> delta_norm = load_tensor (data_base ,
948+ find_tensor (tensors , n_tensors , name_buf ),
949+ & conv_buf , & conv_used , conv_capacity );
950+
951+ snprintf (name_buf , sizeof (name_buf ),
952+ "model.layers.%d.linear_attn.out_proj.weight" , l );
953+ layer -> delta_out_proj = load_tensor (data_base ,
954+ find_tensor (tensors , n_tensors , name_buf ),
955+ & conv_buf , & conv_used , conv_capacity );
956+
794957 /* FFN: gate, up, down projections (SwiGLU) */
795958 snprintf (name_buf , sizeof (name_buf ),
796959 "model.layers.%d.mlp.gate_proj.weight" , l );
0 commit comments