/**
 * TurboQuant — Qwen3.5-0.8B KV Cache Validation
 *
 * Validates all quantization types (including v0.6: RHT, mixed precision)
 * on the Qwen3.5-0.8B architecture: 2 KV heads, 256 head_dim, hybrid attention.
 */

extern "C" {
#include "turboquant/turboquant.h"

/* Forward declarations of the scalar reference kernels exercised by this
 * test, one quantize/dequantize pair per quantization type. Each quantize
 * packs n floats from src into block structure(s) at dst; each dequantize
 * does the reverse. */
void tq_uniform_4b_quantize_ref(const float* src, void* dst, int n);
void tq_uniform_4b_dequantize_ref(const void* src, float* dst, int n);
void tq_uniform_2b_quantize_ref(const float* src, void* dst, int n);
void tq_uniform_2b_dequantize_ref(const void* src, float* dst, int n);
void tq_polar_quantize_ref(const float* src, void* dst, int n);
void tq_polar_dequantize_ref(const void* src, float* dst, int n);
void tq_qjl_quantize_ref(const float* src, void* dst, int n);
void tq_qjl_dequantize_ref(const void* src, float* dst, int n);
void tq_turbo_quantize_ref(const float* src, void* dst, int n);
void tq_turbo_dequantize_ref(const void* src, float* dst, int n);
void tq_mixed_4b8_quantize_ref(const float* src, void* dst, int n);
void tq_mixed_4b8_dequantize_ref(const void* src, float* dst, int n);
}
| 24 | + |
#include <cmath>
#include <cstdint>   /* uint32_t/uint8_t were used without this (transitive include only) */
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <utility>   /* std::move */
#include <vector>
| 31 | + |
| 32 | +#define MAGIC 0x544B5651 |
| 33 | + |
/* One attention layer's key tensor, as loaded from a dump file by
 * load_layer(). Keys are stored head-major, then position, then dim. */
struct LayerData {
    int layer_idx;           /* layer index within the model */
    int num_heads;           /* number of KV heads */
    int seq_len;             /* number of token positions */
    int head_dim;            /* per-head feature dimension */
    std::vector<float> keys; /* [num_heads * seq_len * head_dim] */
};
| 41 | + |
| 42 | +static bool load_layer(const char* path, LayerData& out) { |
| 43 | + FILE* f = fopen(path, "rb"); |
| 44 | + if (!f) return false; |
| 45 | + |
| 46 | + uint32_t hdr[5]; |
| 47 | + if (fread(hdr, sizeof(uint32_t), 5, f) != 5) { fclose(f); return false; } |
| 48 | + if (hdr[0] != MAGIC) { fclose(f); return false; } |
| 49 | + |
| 50 | + out.layer_idx = (int)hdr[1]; |
| 51 | + out.num_heads = (int)hdr[2]; |
| 52 | + out.seq_len = (int)hdr[3]; |
| 53 | + out.head_dim = (int)hdr[4]; |
| 54 | + |
| 55 | + size_t total = (size_t)out.num_heads * out.seq_len * out.head_dim; |
| 56 | + out.keys.resize(total); |
| 57 | + size_t read = fread(out.keys.data(), sizeof(float), total, f); |
| 58 | + fclose(f); |
| 59 | + return read == total; |
| 60 | +} |
| 61 | + |
/* Describes one quantization type under test: its reference
 * quantize/dequantize kernels plus the block geometry needed to size
 * buffers and walk a key vector block by block. */
struct TypeInfo {
    const char* name;            /* human-readable name, also used in metric keys */
    tq_type type;                /* type id, passed to tq_type_bpe() */
    tq_quantize_fn quantize;     /* reference quantizer: n floats -> packed block(s) */
    tq_dequantize_fn dequantize; /* reference dequantizer: packed block(s) -> n floats */
    size_t block_size_bytes;     /* bytes occupied by one packed block */
    int block_elems;             /* float elements per block (TQ_BK or TQ_BK_QJL) */
};
| 70 | + |
| 71 | +int main() { |
| 72 | + printf("\n"); |
| 73 | + printf("================================================================\n"); |
| 74 | + printf(" TurboQuant — Qwen3.5-0.8B KV Cache Validation\n"); |
| 75 | + printf(" Architecture: Hybrid (DeltaNet + Gated Attention)\n"); |
| 76 | + printf(" KV Heads: 2, Head Dim: 256, Attention Layers: 6/24\n"); |
| 77 | + printf("================================================================\n\n"); |
| 78 | + |
| 79 | + /* Load layers */ |
| 80 | + const char* base = "spec/test_vectors/qwen35_kv"; |
| 81 | + int layer_indices[] = {3, 7, 11, 15}; |
| 82 | + std::vector<LayerData> layers; |
| 83 | + |
| 84 | + for (int li : layer_indices) { |
| 85 | + char path[256]; |
| 86 | + snprintf(path, sizeof(path), "%s/layer%d_keys.bin", base, li); |
| 87 | + LayerData ld; |
| 88 | + if (load_layer(path, ld)) { |
| 89 | + printf("Loaded layer %d: %d heads x %d seq x %d dim\n", |
| 90 | + ld.layer_idx, ld.num_heads, ld.seq_len, ld.head_dim); |
| 91 | + layers.push_back(ld); |
| 92 | + } |
| 93 | + } |
| 94 | + |
| 95 | + if (layers.empty()) { |
| 96 | + printf("ERROR: No data found. Run: python3 tests/reference/dump_qwen35_kv.py\n"); |
| 97 | + return 1; |
| 98 | + } |
| 99 | + |
| 100 | + int head_dim = layers[0].head_dim; |
| 101 | + |
| 102 | + /* Type table */ |
| 103 | + TypeInfo types[] = { |
| 104 | + {"uniform_4b", TQ_TYPE_UNIFORM_4B, tq_uniform_4b_quantize_ref, tq_uniform_4b_dequantize_ref, |
| 105 | + sizeof(block_tq_uniform_4b), TQ_BK}, |
| 106 | + {"uniform_2b", TQ_TYPE_UNIFORM_2B, tq_uniform_2b_quantize_ref, tq_uniform_2b_dequantize_ref, |
| 107 | + sizeof(block_tq_uniform_2b), TQ_BK}, |
| 108 | + {"polar_4b", TQ_TYPE_POLAR_4B, tq_polar_quantize_ref, tq_polar_dequantize_ref, |
| 109 | + sizeof(block_tq_polar), TQ_BK}, |
| 110 | + {"qjl_1b", TQ_TYPE_QJL_1B, tq_qjl_quantize_ref, tq_qjl_dequantize_ref, |
| 111 | + sizeof(block_tq_qjl), TQ_BK_QJL}, |
| 112 | + {"turbo_3b", TQ_TYPE_TURBO_3B, tq_turbo_quantize_ref, tq_turbo_dequantize_ref, |
| 113 | + sizeof(block_tq_turbo), TQ_BK}, |
| 114 | + {"mixed_4b8", TQ_TYPE_MIXED_4B8, tq_mixed_4b8_quantize_ref, tq_mixed_4b8_dequantize_ref, |
| 115 | + sizeof(block_tq_mixed_4b8), TQ_BK}, |
| 116 | + }; |
| 117 | + int n_types = sizeof(types) / sizeof(types[0]); |
| 118 | + |
| 119 | + /* RHT seed */ |
| 120 | + uint32_t rht_seed = 42; |
| 121 | + |
| 122 | + /* ============================================================ |
| 123 | + * Per-type, per-layer quality measurement |
| 124 | + * ============================================================ */ |
| 125 | + printf("\n--- Per-Type Quality (averaged over all layers/heads) ---\n\n"); |
| 126 | + printf("%-14s %8s %12s %12s %s\n", |
| 127 | + "Type", "BPE", "MSE", "Attn Cosine", "Grade"); |
| 128 | + printf("%-14s %8s %12s %12s %s\n", |
| 129 | + "--------------", "--------", "------------", "------------", "-----"); |
| 130 | + |
| 131 | + for (int ti = 0; ti < n_types; ti++) { |
| 132 | + const TypeInfo& t = types[ti]; |
| 133 | + double total_mse = 0; |
| 134 | + double sum_dot = 0, sum_a2 = 0, sum_b2 = 0; |
| 135 | + int count = 0; |
| 136 | + |
| 137 | + for (const auto& layer : layers) { |
| 138 | + int hd = layer.head_dim; |
| 139 | + int blocks_per_key = (hd + t.block_elems - 1) / t.block_elems; |
| 140 | + size_t quant_size = blocks_per_key * t.block_size_bytes; |
| 141 | + |
| 142 | + /* Random query */ |
| 143 | + std::vector<float> query(hd); |
| 144 | + uint32_t qseed = (uint32_t)(layer.layer_idx * 1000 + ti); |
| 145 | + for (int d = 0; d < hd; d++) { |
| 146 | + qseed = qseed * 1664525u + 1013904223u; |
| 147 | + query[d] = ((float)(qseed >> 8) / (float)(1 << 24)) * 2.0f - 1.0f; |
| 148 | + } |
| 149 | + |
| 150 | + for (int h = 0; h < layer.num_heads; h++) { |
| 151 | + for (int s = 0; s < layer.seq_len; s++) { |
| 152 | + const float* key = layer.keys.data() + |
| 153 | + (size_t)h * layer.seq_len * hd + (size_t)s * hd; |
| 154 | + |
| 155 | + /* Quantize + dequantize */ |
| 156 | + std::vector<uint8_t> qbuf(quant_size, 0); |
| 157 | + std::vector<float> deq(hd, 0.0f); |
| 158 | + |
| 159 | + /* Process in blocks */ |
| 160 | + int offset = 0; |
| 161 | + int qoffset = 0; |
| 162 | + while (offset < hd) { |
| 163 | + int chunk = (hd - offset > t.block_elems) ? t.block_elems : (hd - offset); |
| 164 | + t.quantize(key + offset, qbuf.data() + qoffset, chunk); |
| 165 | + t.dequantize(qbuf.data() + qoffset, deq.data() + offset, chunk); |
| 166 | + offset += chunk; |
| 167 | + qoffset += (int)t.block_size_bytes; |
| 168 | + } |
| 169 | + |
| 170 | + /* MSE */ |
| 171 | + double mse = 0; |
| 172 | + for (int d = 0; d < hd; d++) { |
| 173 | + double diff = (double)key[d] - (double)deq[d]; |
| 174 | + mse += diff * diff; |
| 175 | + } |
| 176 | + mse /= hd; |
| 177 | + total_mse += mse; |
| 178 | + |
| 179 | + /* Attention scores */ |
| 180 | + float fp32_dot = 0, quant_dot = 0; |
| 181 | + for (int d = 0; d < hd; d++) { |
| 182 | + fp32_dot += query[d] * key[d]; |
| 183 | + quant_dot += query[d] * deq[d]; |
| 184 | + } |
| 185 | + |
| 186 | + sum_dot += (double)fp32_dot * (double)quant_dot; |
| 187 | + sum_a2 += (double)fp32_dot * (double)fp32_dot; |
| 188 | + sum_b2 += (double)quant_dot * (double)quant_dot; |
| 189 | + count++; |
| 190 | + } |
| 191 | + } |
| 192 | + } |
| 193 | + |
| 194 | + double avg_mse = total_mse / count; |
| 195 | + double cosine = (sum_a2 > 0 && sum_b2 > 0) ? |
| 196 | + sum_dot / (sqrt(sum_a2) * sqrt(sum_b2)) : 0; |
| 197 | + |
| 198 | + float bpe = tq_type_bpe(t.type); |
| 199 | + const char* grade; |
| 200 | + if (cosine > 0.99) grade = "A+"; |
| 201 | + else if (cosine > 0.95) grade = "A"; |
| 202 | + else if (cosine > 0.90) grade = "B+"; |
| 203 | + else if (cosine > 0.80) grade = "B"; |
| 204 | + else grade = "C"; |
| 205 | + |
| 206 | + printf("%-14s %6.1f %10.6f %10.6f %s\n", |
| 207 | + t.name, bpe, avg_mse, cosine, grade); |
| 208 | + |
| 209 | + /* Machine-readable */ |
| 210 | + printf("qwen35_mse_%s=%.6f\n", t.name, avg_mse); |
| 211 | + printf("qwen35_cosine_%s=%.6f\n", t.name, cosine); |
| 212 | + } |
| 213 | + |
| 214 | + /* ============================================================ |
| 215 | + * RHT comparison (uniform_4b with and without RHT) |
| 216 | + * ============================================================ */ |
| 217 | + printf("\n--- RHT A/B Comparison (uniform_4b, head_dim=%d) ---\n\n", head_dim); |
| 218 | + |
| 219 | + double rht_mse_sum = 0, raw_mse_sum = 0; |
| 220 | + int rht_count = 0; |
| 221 | + |
| 222 | + for (const auto& layer : layers) { |
| 223 | + int hd = layer.head_dim; |
| 224 | + int blocks = (hd + TQ_BK - 1) / TQ_BK; |
| 225 | + size_t qsize = blocks * sizeof(block_tq_uniform_4b); |
| 226 | + |
| 227 | + for (int h = 0; h < layer.num_heads; h++) { |
| 228 | + for (int s = 0; s < layer.seq_len; s++) { |
| 229 | + const float* key = layer.keys.data() + |
| 230 | + (size_t)h * layer.seq_len * hd + (size_t)s * hd; |
| 231 | + |
| 232 | + /* Raw quantize */ |
| 233 | + std::vector<uint8_t> raw_buf(qsize, 0); |
| 234 | + std::vector<float> raw_deq(hd, 0.0f); |
| 235 | + int off = 0, qoff = 0; |
| 236 | + while (off < hd) { |
| 237 | + int chunk = (hd - off > TQ_BK) ? TQ_BK : (hd - off); |
| 238 | + tq_uniform_4b_quantize_ref(key + off, raw_buf.data() + qoff, chunk); |
| 239 | + tq_uniform_4b_dequantize_ref(raw_buf.data() + qoff, raw_deq.data() + off, chunk); |
| 240 | + off += chunk; qoff += sizeof(block_tq_uniform_4b); |
| 241 | + } |
| 242 | + |
| 243 | + /* RHT quantize */ |
| 244 | + std::vector<float> rotated(hd); |
| 245 | + memcpy(rotated.data(), key, hd * sizeof(float)); |
| 246 | + tq_rht_transform(rotated.data(), hd, rht_seed); |
| 247 | + |
| 248 | + std::vector<uint8_t> rht_buf(qsize, 0); |
| 249 | + std::vector<float> rht_deq(hd, 0.0f); |
| 250 | + off = 0; qoff = 0; |
| 251 | + while (off < hd) { |
| 252 | + int chunk = (hd - off > TQ_BK) ? TQ_BK : (hd - off); |
| 253 | + tq_uniform_4b_quantize_ref(rotated.data() + off, rht_buf.data() + qoff, chunk); |
| 254 | + tq_uniform_4b_dequantize_ref(rht_buf.data() + qoff, rht_deq.data() + off, chunk); |
| 255 | + off += chunk; qoff += sizeof(block_tq_uniform_4b); |
| 256 | + } |
| 257 | + tq_rht_inverse(rht_deq.data(), hd, rht_seed); |
| 258 | + |
| 259 | + /* MSE */ |
| 260 | + double raw_mse = 0, rht_mse = 0; |
| 261 | + for (int d = 0; d < hd; d++) { |
| 262 | + double d1 = (double)key[d] - (double)raw_deq[d]; |
| 263 | + double d2 = (double)key[d] - (double)rht_deq[d]; |
| 264 | + raw_mse += d1 * d1; |
| 265 | + rht_mse += d2 * d2; |
| 266 | + } |
| 267 | + raw_mse_sum += raw_mse / hd; |
| 268 | + rht_mse_sum += rht_mse / hd; |
| 269 | + rht_count++; |
| 270 | + } |
| 271 | + } |
| 272 | + } |
| 273 | + |
| 274 | + double raw_avg = raw_mse_sum / rht_count; |
| 275 | + double rht_avg = rht_mse_sum / rht_count; |
| 276 | + printf(" Raw uniform_4b MSE: %.6f\n", raw_avg); |
| 277 | + printf(" RHT+uniform_4b MSE: %.6f\n", rht_avg); |
| 278 | + printf(" Improvement: %.1fx\n", raw_avg / (rht_avg > 0 ? rht_avg : 1e-10)); |
| 279 | + printf("qwen35_rht_raw_mse=%.6f\n", raw_avg); |
| 280 | + printf("qwen35_rht_improved_mse=%.6f\n", rht_avg); |
| 281 | + printf("qwen35_rht_improvement=%.1fx\n", raw_avg / (rht_avg > 0 ? rht_avg : 1e-10)); |
| 282 | + |
| 283 | + /* ============================================================ |
| 284 | + * K/V Asymmetric comparison |
| 285 | + * ============================================================ */ |
| 286 | + printf("\n--- K/V Asymmetric: Key 4-bit + Value 2-bit ---\n\n"); |
| 287 | + float k_bpe = tq_type_bpe(TQ_TYPE_UNIFORM_4B); |
| 288 | + float v_bpe = tq_type_bpe(TQ_TYPE_UNIFORM_2B); |
| 289 | + printf(" Key bits: %.1f (uniform_4b)\n", k_bpe); |
| 290 | + printf(" Value bits: %.1f (uniform_2b)\n", v_bpe); |
| 291 | + printf(" Average: %.2f bits/element\n", (k_bpe + v_bpe) / 2.0f); |
| 292 | + printf(" FP16 equiv: %.1fx compression\n", 32.0f / ((k_bpe + v_bpe) / 2.0f)); |
| 293 | + |
| 294 | + /* ============================================================ |
| 295 | + * Memory impact for Qwen3.5-0.8B |
| 296 | + * ============================================================ */ |
| 297 | + printf("\n--- Memory Impact: Qwen3.5-0.8B ---\n\n"); |
| 298 | + int kv_heads = 2, att_layers = 6; |
| 299 | + int ctx_lengths[] = {4096, 16384, 65536, 131072}; |
| 300 | + printf(" %-8s %-10s %-10s %-10s %-6s\n", |
| 301 | + "Context", "FP16", "Uniform4b", "K4V2", "Saved"); |
| 302 | + |
| 303 | + for (int ctx : ctx_lengths) { |
| 304 | + double fp16 = (double)att_layers * kv_heads * head_dim * ctx * 2 * 2 / (1024.0*1024.0*1024.0); |
| 305 | + double u4b = (double)att_layers * kv_heads * head_dim * ctx * 2 * |
| 306 | + (k_bpe / 8.0) / (1024.0*1024.0*1024.0); |
| 307 | + double k4v2 = (double)att_layers * kv_heads * head_dim * ctx * |
| 308 | + ((k_bpe + v_bpe) / 2.0 / 8.0) / (1024.0*1024.0*1024.0); |
| 309 | + |
| 310 | + char ctx_str[16]; |
| 311 | + if (ctx >= 1024) snprintf(ctx_str, sizeof(ctx_str), "%dK", ctx/1024); |
| 312 | + else snprintf(ctx_str, sizeof(ctx_str), "%d", ctx); |
| 313 | + |
| 314 | + printf(" %-8s %7.2f GB %7.2f GB %7.2f GB %4.0f%%\n", |
| 315 | + ctx_str, fp16, u4b, k4v2, (1.0 - k4v2/fp16)*100); |
| 316 | + } |
| 317 | + |
| 318 | + printf("\n================================================================\n"); |
| 319 | + printf(" Validation Complete\n"); |
| 320 | + printf("================================================================\n\n"); |
| 321 | + |
| 322 | + return 0; |
| 323 | +} |
0 commit comments