@@ -1336,6 +1336,227 @@ void tq_quantize_weights(tq_model_t* model) {
             used / (1024 * 1024), used * 4 / (1024 * 1024));
 }
 
+/* ============================================================
+ * Q4_0 weight quantization — quantize all layer weights post-load
+ *
+ * Converts FP32 weight matrices to Q4_0 (packed 4-bit + per-block float
+ * scale, block_size = 32). This reduces memory ~6.4x: FP32 uses 4
+ * bytes/value, Q4_0 uses 0.5 byte + 4 bytes/32 = 0.625 bytes/value.
+ *
+ * Each weight matrix [rows, cols] gets:
+ *   - uint8_t qs[rows * (cols/32) * 16] — packed 4-bit values (2 per byte)
+ *   - float scales[rows * (cols/32)]    — per-block scales
+ *
+ * After quantization, the original FP32 pointer is set to NULL.
+ * ============================================================ */
+
+/* Helper: quantize a single weight matrix to Q4 and store it in a pre-allocated buffer */
+static void quantize_matrix_q4(const float* src, int rows, int cols,
+                               uint8_t** out_qs, float** out_scales,
+                               char** buf, size_t* used) {
+    if (!src || rows <= 0 || cols <= 0) {
+        *out_qs = NULL;
+        *out_scales = NULL;
+        return;
+    }
+    int n_blocks_per_row = (cols + 31) / 32;
+    size_t qs_bytes = (size_t)rows * n_blocks_per_row * 16;  /* 16 packed bytes per block */
+    size_t sc_bytes = (size_t)rows * n_blocks_per_row * sizeof(float);
+
+    uint8_t* qs = (uint8_t*)(*buf + *used);
+    *used += qs_bytes;
+    float* sc = (float*)(*buf + *used);
+    *used += sc_bytes;
+
+    for (int r = 0; r < rows; r++) {
+        tq_quantize_row_q4(src + (size_t)r * cols,
+                           qs + (size_t)r * n_blocks_per_row * 16,
+                           sc + (size_t)r * n_blocks_per_row,
+                           cols);
+    }
+    *out_qs = qs;
+    *out_scales = sc;
+}
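
Side note for reviewers: tq_quantize_row_q4 is called above but is not part of this hunk. As a rough sketch of what a Q4_0-style row quantizer producing this layout (16 packed bytes plus one float scale per 32 values) could look like, assuming a symmetric scale of amax/7 and a +8 nibble bias, neither of which this diff confirms:

    #include <math.h>    /* fabsf, lroundf */
    #include <stdint.h>

    /* Sketch only; the real tq_quantize_row_q4 may use a different
     * rounding/scale convention. */
    static void sketch_quantize_row_q4(const float* x, uint8_t* qs,
                                       float* scales, int n) {
        for (int b = 0; b * 32 < n; b++) {
            const float* xb = x + b * 32;
            int count = (n - b * 32 < 32) ? (n - b * 32) : 32;

            float amax = 0.0f;                 /* per-block abs max */
            for (int i = 0; i < count; i++) {
                float a = fabsf(xb[i]);
                if (a > amax) amax = a;
            }
            float d  = amax / 7.0f;            /* one float scale per block */
            float id = (d != 0.0f) ? 1.0f / d : 0.0f;
            scales[b] = d;

            for (int i = 0; i < 16; i++) {     /* 32 values -> 16 bytes */
                float v0 = (2 * i     < count) ? xb[2 * i]     * id : 0.0f;
                float v1 = (2 * i + 1 < count) ? xb[2 * i + 1] * id : 0.0f;
                int q0 = (int)lroundf(v0) + 8; /* bias nibble into [1, 15] */
                int q1 = (int)lroundf(v1) + 8;
                if (q0 < 0) q0 = 0; if (q0 > 15) q0 = 15;
                if (q1 < 0) q1 = 0; if (q1 > 15) q1 = 15;
                qs[b * 16 + i] = (uint8_t)(q0 | (q1 << 4));
            }
        }
    }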
+
+/* Calculate total Q4 buffer size needed for all layer weights */
+static size_t calc_q4_buffer_size(const tq_model_t* model) {
+    size_t total = 0;
+    const tq_model_config_t* c = &model->config;
+    int dim    = c->hidden_dim;
+    int q_dim  = c->n_heads * c->head_dim;
+    int kv_dim = c->n_kv_heads * c->head_dim;
+    int inter  = c->intermediate_dim;
+    int qg_dim = c->attn_output_gate ? q_dim * 2 : q_dim;
+
+    /* DeltaNet dimensions */
+    int delta_qkv_dim = 3 * c->delta_n_heads * c->delta_key_head_dim;
+    int delta_z_dim   = c->delta_n_heads * c->delta_value_head_dim;
+    int delta_dn      = c->delta_n_heads;
+
+    for (int l = 0; l < c->n_layers; l++) {
+        const tq_layer_weights_t* layer = &model->layers[l];
+
+        /* Self-attention weights */
+        if (layer->wq) {
+            int nb = (dim + 31) / 32;
+            total += (size_t)qg_dim * nb * 16;  /* packed Q4 data */
+            total += (size_t)qg_dim * nb * 4;   /* float scales */
+        }
+        if (layer->wk) {
+            int nb = (dim + 31) / 32;
+            total += (size_t)kv_dim * nb * 16;
+            total += (size_t)kv_dim * nb * 4;
+        }
+        if (layer->wv) {
+            int nb = (dim + 31) / 32;
+            total += (size_t)kv_dim * nb * 16;
+            total += (size_t)kv_dim * nb * 4;
+        }
+        if (layer->wo) {
+            int nb = (q_dim + 31) / 32;
+            total += (size_t)dim * nb * 16;
+            total += (size_t)dim * nb * 4;
+        }
+
+        /* FFN weights */
+        if (layer->w_gate) {
+            int nb = (dim + 31) / 32;
+            total += (size_t)inter * nb * 16;
+            total += (size_t)inter * nb * 4;
+        }
+        if (layer->w_up) {
+            int nb = (dim + 31) / 32;
+            total += (size_t)inter * nb * 16;
+            total += (size_t)inter * nb * 4;
+        }
+        if (layer->w_down) {
+            int nb = (inter + 31) / 32;
+            total += (size_t)dim * nb * 16;
+            total += (size_t)dim * nb * 4;
+        }
+
+        /* DeltaNet weights */
+        if (layer->delta_in_proj_qkv) {
+            int nb = (dim + 31) / 32;
+            total += (size_t)delta_qkv_dim * nb * 16;
+            total += (size_t)delta_qkv_dim * nb * 4;
+        }
+        if (layer->delta_in_proj_z) {
+            int nb = (dim + 31) / 32;
+            total += (size_t)delta_z_dim * nb * 16;
+            total += (size_t)delta_z_dim * nb * 4;
+        }
+        if (layer->delta_in_proj_a) {
+            int nb = (dim + 31) / 32;
+            total += (size_t)delta_dn * nb * 16;
+            total += (size_t)delta_dn * nb * 4;
+        }
+        if (layer->delta_in_proj_b) {
+            int nb = (dim + 31) / 32;
+            total += (size_t)delta_dn * nb * 16;
+            total += (size_t)delta_dn * nb * 4;
+        }
+        if (layer->delta_out_proj) {
+            int nb = (delta_z_dim + 31) / 32;
+            total += (size_t)dim * nb * 16;
+            total += (size_t)dim * nb * 4;
+        }
+    }
+    return total;
+}
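
The per-row accounting above (nb * 16 packed bytes plus nb * 4 scale bytes) mirrors how a Q4 kernel walks the buffer. The matmul path is not in this hunk; a sketch of a matvec consuming this layout, under the same assumed nibble convention as the sketch above:

    /* Sketch only; assumes low nibble = even column, high nibble = odd
     * column, values biased by +8, one float scale per 32-column block. */
    static void sketch_matvec_q4(float* y, const uint8_t* qs, const float* scales,
                                 const float* x, int rows, int cols) {
        int nb = (cols + 31) / 32;                       /* blocks per row */
        for (int r = 0; r < rows; r++) {
            const uint8_t* rq = qs + (size_t)r * nb * 16;
            const float*   rs = scales + (size_t)r * nb;
            float sum = 0.0f;
            for (int b = 0; b < nb; b++) {
                float d = rs[b];
                for (int i = 0; i < 16; i++) {
                    uint8_t byte = rq[b * 16 + i];
                    int c0 = b * 32 + 2 * i, c1 = c0 + 1;
                    if (c0 < cols) sum += d * (float)((byte & 0x0F) - 8) * x[c0];
                    if (c1 < cols) sum += d * (float)((byte >> 4)   - 8) * x[c1];
                }
            }
            y[r] = sum;
        }
    }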
+
+void tq_quantize_weights_q4(tq_model_t* model) {
+    if (!model || model->use_q4_weights) return;
+
+    const tq_model_config_t* c = &model->config;
+    int dim    = c->hidden_dim;
+    int q_dim  = c->n_heads * c->head_dim;
+    int kv_dim = c->n_kv_heads * c->head_dim;
+    int inter  = c->intermediate_dim;
+    int qg_dim = c->attn_output_gate ? q_dim * 2 : q_dim;
+
+    /* DeltaNet dimensions */
+    int delta_qkv_dim = 3 * c->delta_n_heads * c->delta_key_head_dim;
+    int delta_z_dim   = c->delta_n_heads * c->delta_value_head_dim;
+    int delta_dn      = c->delta_n_heads;
+
+    size_t buf_size = calc_q4_buffer_size(model);
+    char* buf = (char*)malloc(buf_size);
+    if (!buf) {
+        fprintf(stderr, "tq_quantize_weights_q4: failed to allocate %zu MB for Q4\n",
+                buf_size / (1024 * 1024));
+        return;
+    }
+    size_t used = 0;
+
+    for (int l = 0; l < c->n_layers; l++) {
+        tq_layer_weights_t* layer = &model->layers[l];
+
+        /* Self-attention */
+        quantize_matrix_q4(layer->wq, qg_dim, dim,
+                           &layer->wq_q4, &layer->wq_q4s, &buf, &used);
+        if (layer->wq_q4) layer->wq = NULL;
+
+        quantize_matrix_q4(layer->wk, kv_dim, dim,
+                           &layer->wk_q4, &layer->wk_q4s, &buf, &used);
+        if (layer->wk_q4) layer->wk = NULL;
+
+        quantize_matrix_q4(layer->wv, kv_dim, dim,
+                           &layer->wv_q4, &layer->wv_q4s, &buf, &used);
+        if (layer->wv_q4) layer->wv = NULL;
+
+        quantize_matrix_q4(layer->wo, dim, q_dim,
+                           &layer->wo_q4, &layer->wo_q4s, &buf, &used);
+        if (layer->wo_q4) layer->wo = NULL;
+
+        /* FFN */
+        quantize_matrix_q4(layer->w_gate, inter, dim,
+                           &layer->w_gate_q4, &layer->w_gate_q4s, &buf, &used);
+        if (layer->w_gate_q4) layer->w_gate = NULL;
+
+        quantize_matrix_q4(layer->w_up, inter, dim,
+                           &layer->w_up_q4, &layer->w_up_q4s, &buf, &used);
+        if (layer->w_up_q4) layer->w_up = NULL;
+
+        quantize_matrix_q4(layer->w_down, dim, inter,
+                           &layer->w_down_q4, &layer->w_down_q4s, &buf, &used);
+        if (layer->w_down_q4) layer->w_down = NULL;
+
+        /* DeltaNet */
+        quantize_matrix_q4(layer->delta_in_proj_qkv, delta_qkv_dim, dim,
+                           &layer->delta_in_proj_qkv_q4, &layer->delta_in_proj_qkv_q4s,
+                           &buf, &used);
+        if (layer->delta_in_proj_qkv_q4) layer->delta_in_proj_qkv = NULL;
+
+        quantize_matrix_q4(layer->delta_in_proj_z, delta_z_dim, dim,
+                           &layer->delta_in_proj_z_q4, &layer->delta_in_proj_z_q4s,
+                           &buf, &used);
+        if (layer->delta_in_proj_z_q4) layer->delta_in_proj_z = NULL;
+
+        quantize_matrix_q4(layer->delta_in_proj_a, delta_dn, dim,
+                           &layer->delta_in_proj_a_q4, &layer->delta_in_proj_a_q4s,
+                           &buf, &used);
+        if (layer->delta_in_proj_a_q4) layer->delta_in_proj_a = NULL;
+
+        quantize_matrix_q4(layer->delta_in_proj_b, delta_dn, dim,
+                           &layer->delta_in_proj_b_q4, &layer->delta_in_proj_b_q4s,
+                           &buf, &used);
+        if (layer->delta_in_proj_b_q4) layer->delta_in_proj_b = NULL;
+
+        quantize_matrix_q4(layer->delta_out_proj, dim, delta_z_dim,
+                           &layer->delta_out_proj_q4, &layer->delta_out_proj_q4s,
+                           &buf, &used);
+        if (layer->delta_out_proj_q4) layer->delta_out_proj = NULL;
+    }
+
+    model->use_q4_weights = 1;
+    model->_q4_data = buf;
+    model->_q4_size = used;
+
+    fprintf(stderr, "tq_quantize_weights_q4: quantized to Q4 (%zu MB, was ~%zu MB FP32)\n",
+            used / (1024 * 1024), used * 32 / 5 / (1024 * 1024));  /* FP32 ≈ 6.4x Q4_0 */
+}
+
 /* ============================================================
  * Free model
  * ============================================================ */
@@ -1350,6 +1571,7 @@ void tq_free_model(tq_model_t* model) {
 
     free(model->_converted_data);
     free(model->_q8_data);
+    free(model->_q4_data);
     free(model->attn_layer_indices);
     free(model->layers);
     free(model);
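
Caller-side flow, for completeness (tq_load_model is a placeholder name for whatever loader precedes this; tq_quantize_weights_q4 and tq_free_model are from this commit):

    tq_model_t* model = tq_load_model("model.bin");  /* hypothetical loader */
    if (model) {
        tq_quantize_weights_q4(model);  /* FP32 weights -> Q4_0, in place */
        /* Inference must now read the *_q4 / *_q4s fields, since the
         * FP32 layer pointers have been set to NULL. */
        tq_free_model(model);           /* also frees model->_q4_data */
    }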