@@ -1107,6 +1107,235 @@ tq_model_t* tq_load_model(const char* path) {
11071107 return NULL ;
11081108}
11091109
1110+ /* ============================================================
1111+ * Q8 weight quantization — quantize all layer weights post-load
1112+ *
1113+ * Converts FP32 weight matrices to Q8 (int8 + per-block float scale,
1114+ * block_size=32). This cuts weight storage by roughly 3.5x: FP32 uses
1115+ * 4 bytes/value, Q8 uses 1 byte + 4 bytes/32 = 1.125 bytes/value.
1116+ *
1117+ * Each weight matrix [rows, cols] gets:
1118+ * - int8_t q8[rows * cols] — quantized values
1119+ * - float scales[rows * ceil(cols/32)] — per-block scales
1120+ *
1121+ * After quantization, the original FP32 pointer is set to NULL
1122+ * (it either pointed into mmap or conversion buffer, both still alive).
1123+ * ============================================================ */
1124+
1125+ /* Helper: quantize a single weight matrix and store into pre-allocated buffer */
1126+ static void quantize_matrix_q8 (const float * src , int rows , int cols ,
1127+ int8_t * * out_qs , float * * out_scales ,
1128+ char * * buf , size_t * used ) {
1129+ if (!src || rows <= 0 || cols <= 0 ) {
1130+ * out_qs = NULL ;
1131+ * out_scales = NULL ;
1132+ return ;
1133+ }
1134+ int n_blocks_per_row = (cols + 31 ) / 32 ;
1135+ size_t qs_bytes = (size_t )rows * cols * sizeof (int8_t );
1136+ size_t sc_bytes = (size_t )rows * n_blocks_per_row * sizeof (float );
1137+
1138+ int8_t * qs = (int8_t * )(* buf + * used );
1139+ * used += qs_bytes ;
1140+ float * sc = (float * )(* buf + * used );
1141+ * used += sc_bytes ;
1142+
1143+ for (int r = 0 ; r < rows ; r ++ ) {
1144+ tq_quantize_row_q8 (src + (size_t )r * cols ,
1145+ qs + (size_t )r * cols ,
1146+ sc + (size_t )r * n_blocks_per_row ,
1147+ cols );
1148+ }
1149+ * out_qs = qs ;
1150+ * out_scales = sc ;
1151+ }
1152+
1153+ /* Calculate total Q8 buffer size needed for all layer weights */
1154+ static size_t calc_q8_buffer_size (const tq_model_t * model ) {
1155+ size_t total = 0 ;
1156+ const tq_model_config_t * c = & model -> config ;
1157+ int dim = c -> hidden_dim ;
1158+ int q_dim = c -> n_heads * c -> head_dim ;
1159+ int kv_dim = c -> n_kv_heads * c -> head_dim ;
1160+ int inter = c -> intermediate_dim ;
1161+ int qg_dim = c -> attn_output_gate ? q_dim * 2 : q_dim ;
1162+
1163+ /* DeltaNet dimensions */
1164+ int delta_qkv_dim = 3 * c -> delta_n_heads * c -> delta_key_head_dim ;
1165+ int delta_z_dim = c -> delta_n_heads * c -> delta_value_head_dim ;
1166+ int delta_dn = c -> delta_n_heads ;
1167+
1168+ for (int l = 0 ; l < c -> n_layers ; l ++ ) {
1169+ const tq_layer_weights_t * layer = & model -> layers [l ];
1170+
1171+ /* Self-attention weights */
1172+ if (layer -> wq ) {
1173+ int rows = qg_dim ;
1174+ int cols = dim ;
1175+ int nb = (cols + 31 ) / 32 ;
1176+ total += (size_t )rows * cols ; /* int8 data */
1177+ total += (size_t )rows * nb * 4 ; /* float scales */
1178+ }
1179+ if (layer -> wk ) {
1180+ int nb = (dim + 31 ) / 32 ;
1181+ total += (size_t )kv_dim * dim ;
1182+ total += (size_t )kv_dim * nb * 4 ;
1183+ }
1184+ if (layer -> wv ) {
1185+ int nb = (dim + 31 ) / 32 ;
1186+ total += (size_t )kv_dim * dim ;
1187+ total += (size_t )kv_dim * nb * 4 ;
1188+ }
1189+ if (layer -> wo ) {
1190+ int nb = (q_dim + 31 ) / 32 ;
1191+ total += (size_t )dim * q_dim ;
1192+ total += (size_t )dim * nb * 4 ;
1193+ }
1194+
1195+ /* FFN weights */
1196+ if (layer -> w_gate ) {
1197+ int nb = (dim + 31 ) / 32 ;
1198+ total += (size_t )inter * dim ;
1199+ total += (size_t )inter * nb * 4 ;
1200+ }
1201+ if (layer -> w_up ) {
1202+ int nb = (dim + 31 ) / 32 ;
1203+ total += (size_t )inter * dim ;
1204+ total += (size_t )inter * nb * 4 ;
1205+ }
1206+ if (layer -> w_down ) {
1207+ int nb = (inter + 31 ) / 32 ;
1208+ total += (size_t )dim * inter ;
1209+ total += (size_t )dim * nb * 4 ;
1210+ }
1211+
1212+ /* DeltaNet weights */
1213+ if (layer -> delta_in_proj_qkv ) {
1214+ int nb = (dim + 31 ) / 32 ;
1215+ total += (size_t )delta_qkv_dim * dim ;
1216+ total += (size_t )delta_qkv_dim * nb * 4 ;
1217+ }
1218+ if (layer -> delta_in_proj_z ) {
1219+ int nb = (dim + 31 ) / 32 ;
1220+ total += (size_t )delta_z_dim * dim ;
1221+ total += (size_t )delta_z_dim * nb * 4 ;
1222+ }
1223+ if (layer -> delta_in_proj_a ) {
1224+ int nb = (dim + 31 ) / 32 ;
1225+ total += (size_t )delta_dn * dim ;
1226+ total += (size_t )delta_dn * nb * 4 ;
1227+ }
1228+ if (layer -> delta_in_proj_b ) {
1229+ int nb = (dim + 31 ) / 32 ;
1230+ total += (size_t )delta_dn * dim ;
1231+ total += (size_t )delta_dn * nb * 4 ;
1232+ }
1233+ if (layer -> delta_out_proj ) {
1234+ int nb = (delta_z_dim + 31 ) / 32 ;
1235+ total += (size_t )dim * delta_z_dim ;
1236+ total += (size_t )dim * nb * 4 ;
1237+ }
1238+ }
1239+ return total ;
1240+ }
1241+
1242+ void tq_quantize_weights (tq_model_t * model ) {
1243+ if (!model || model -> use_q8_weights ) return ;
1244+
1245+ const tq_model_config_t * c = & model -> config ;
1246+ int dim = c -> hidden_dim ;
1247+ int q_dim = c -> n_heads * c -> head_dim ;
1248+ int kv_dim = c -> n_kv_heads * c -> head_dim ;
1249+ int inter = c -> intermediate_dim ;
1250+ int qg_dim = c -> attn_output_gate ? q_dim * 2 : q_dim ;
1251+
1252+ /* DeltaNet dimensions */
1253+ int delta_qkv_dim = 3 * c -> delta_n_heads * c -> delta_key_head_dim ;
1254+ int delta_z_dim = c -> delta_n_heads * c -> delta_value_head_dim ;
1255+ int delta_dn = c -> delta_n_heads ;
1256+
1257+ size_t buf_size = calc_q8_buffer_size (model );
1258+ char * buf = (char * )malloc (buf_size );
1259+ if (!buf ) {
1260+ fprintf (stderr , "tq_quantize_weights: failed to allocate %zu MB for Q8\n" ,
1261+ buf_size / (1024 * 1024 ));
1262+ return ;
1263+ }
1264+ size_t used = 0 ;
1265+
1266+ for (int l = 0 ; l < c -> n_layers ; l ++ ) {
1267+ tq_layer_weights_t * layer = & model -> layers [l ];
1268+
1269+ /* Self-attention */
1270+ quantize_matrix_q8 (layer -> wq , qg_dim , dim ,
1271+ & layer -> wq_q8 , & layer -> wq_q8s , & buf , & used );
1272+ if (layer -> wq_q8 ) layer -> wq = NULL ;
1273+
1274+ quantize_matrix_q8 (layer -> wk , kv_dim , dim ,
1275+ & layer -> wk_q8 , & layer -> wk_q8s , & buf , & used );
1276+ if (layer -> wk_q8 ) layer -> wk = NULL ;
1277+
1278+ quantize_matrix_q8 (layer -> wv , kv_dim , dim ,
1279+ & layer -> wv_q8 , & layer -> wv_q8s , & buf , & used );
1280+ if (layer -> wv_q8 ) layer -> wv = NULL ;
1281+
1282+ quantize_matrix_q8 (layer -> wo , dim , q_dim ,
1283+ & layer -> wo_q8 , & layer -> wo_q8s , & buf , & used );
1284+ if (layer -> wo_q8 ) layer -> wo = NULL ;
1285+
1286+ /* FFN */
1287+ quantize_matrix_q8 (layer -> w_gate , inter , dim ,
1288+ & layer -> w_gate_q8 , & layer -> w_gate_q8s , & buf , & used );
1289+ if (layer -> w_gate_q8 ) layer -> w_gate = NULL ;
1290+
1291+ quantize_matrix_q8 (layer -> w_up , inter , dim ,
1292+ & layer -> w_up_q8 , & layer -> w_up_q8s , & buf , & used );
1293+ if (layer -> w_up_q8 ) layer -> w_up = NULL ;
1294+
1295+ quantize_matrix_q8 (layer -> w_down , dim , inter ,
1296+ & layer -> w_down_q8 , & layer -> w_down_q8s , & buf , & used );
1297+ if (layer -> w_down_q8 ) layer -> w_down = NULL ;
1298+
1299+ /* DeltaNet */
1300+ quantize_matrix_q8 (layer -> delta_in_proj_qkv , delta_qkv_dim , dim ,
1301+ & layer -> delta_in_proj_qkv_q8 , & layer -> delta_in_proj_qkv_q8s ,
1302+ & buf , & used );
1303+ if (layer -> delta_in_proj_qkv_q8 ) layer -> delta_in_proj_qkv = NULL ;
1304+
1305+ quantize_matrix_q8 (layer -> delta_in_proj_z , delta_z_dim , dim ,
1306+ & layer -> delta_in_proj_z_q8 , & layer -> delta_in_proj_z_q8s ,
1307+ & buf , & used );
1308+ if (layer -> delta_in_proj_z_q8 ) layer -> delta_in_proj_z = NULL ;
1309+
1310+ quantize_matrix_q8 (layer -> delta_in_proj_a , delta_dn , dim ,
1311+ & layer -> delta_in_proj_a_q8 , & layer -> delta_in_proj_a_q8s ,
1312+ & buf , & used );
1313+ if (layer -> delta_in_proj_a_q8 ) layer -> delta_in_proj_a = NULL ;
1314+
1315+ quantize_matrix_q8 (layer -> delta_in_proj_b , delta_dn , dim ,
1316+ & layer -> delta_in_proj_b_q8 , & layer -> delta_in_proj_b_q8s ,
1317+ & buf , & used );
1318+ if (layer -> delta_in_proj_b_q8 ) layer -> delta_in_proj_b = NULL ;
1319+
1320+ quantize_matrix_q8 (layer -> delta_out_proj , dim , delta_z_dim ,
1321+ & layer -> delta_out_proj_q8 , & layer -> delta_out_proj_q8s ,
1322+ & buf , & used );
1323+ if (layer -> delta_out_proj_q8 ) layer -> delta_out_proj = NULL ;
1324+ }
1325+
1326+ model -> use_q8_weights = 1 ;
1327+ model -> _q8_data = buf ;
1328+ model -> _q8_size = used ;
1329+
1330+ /* If original weights were in conversion buffer (BF16->FP32), free it.
1331+ * The converted_data is no longer needed since all layer weights are now Q8.
1332+ * Note: norm weights, conv1d, bias, etc. still point into converted_data or mmap,
1333+ * so we CANNOT free it. Keep it alive. */
1334+
1335+ fprintf (stderr , "tq_quantize_weights: quantized to Q8 (%zu MB, was ~%zu MB FP32)\n" ,
1336+ used / (1024 * 1024 ), used * 4 / (1024 * 1024 ));
1337+ }
1338+
11101339/* ============================================================
11111340 * Free model
11121341 * ============================================================ */
@@ -1120,6 +1349,7 @@ void tq_free_model(tq_model_t* model) {
11201349#endif
11211350
11221351 free (model -> _converted_data );
1352+ free (model -> _q8_data );
11231353 free (model -> attn_layer_indices );
11241354 free (model -> layers );
11251355 free (model );
0 commit comments