quantumaikr
diff --git a/‎.claude/state.md‎
Lines changed: 55 additions & 1 deletion b/‎.claude/state.md‎
Lines changed: 55 additions & 1 deletion
diff --git a/‎CMakeLists.txt‎
Lines changed: 4 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎docs/plan/prd/prd_v1.0.md‎
Lines changed: 20 additions & 0 deletions b/‎docs/plan/prd/prd_v1.0.md‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎include/turboquant/tq_engine.h‎
Lines changed: 67 additions & 0 deletions b/‎include/turboquant/tq_engine.h‎
Lines changed: 67 additions & 0 deletions
@@ -1,6 +1,6 @@
 # TurboQuant.cpp — Session State
 
-**Last updated**: 2026-03-29 (v0.9.1 non-matmul overhead optimization)
+**Last updated**: 2026-03-29 (v0.9.2 TQM format for instant model loading)
 **Last commit**: pending
 
 ## Speed Progression
@@ -61,6 +61,60 @@ llama.cpp Q4_K_M:  ~50   tok/s  ← target
 - `src/engine/tq_ops.c` — Added tq_matmul_q4_preq(), fixed unused var warning
 - `include/turboquant/tq_engine.h` — Added tq_matmul_q4_preq() declaration
 
+## v0.9.2 Changes — TQM Format (Instant Model Loading)
+
+### Problem
+Loading safetensors BF16 models requires: mmap → parse JSON → BF16→FP32 convert → Q4 quantize.
+This takes ~6s for an 0.8B model. Goal: <0.5s via pre-quantized mmap-ready format.
+
+### Solution: TQM (TurboQuant Model) binary format
+- 512-byte packed header (tqm_header_t) with full model config
+- Embedded tokenizer.json (raw bytes, variable size)
+- Pre-quantized Q4 weights + FP32 norms + BF16 embeddings
+- All sections 64-byte aligned for efficient mmap access
+- Zero-copy loading: weight pointers point directly into mmap'd file
+
+### Components Implemented
+1. **Format definition** (`include/turboquant/tq_engine.h`)
+   - `tqm_header_t` — 512-byte packed struct with magic, config, section offsets
+   - `TQM_MAGIC` (0x4D515454 = "TTQM"), `TQM_VERSION` (1), `TQM_ALIGN` (64)
+
+2. **TQM loader** (`src/engine/tq_model.c`)
+   - `tq_load_tqm()` — mmap file, cast header, set weight pointers directly
+   - Zero malloc for weights, zero conversion — all pointers into mmap'd data
+   - `tq_load_model()` auto-detects TQM vs safetensors by magic bytes
+
+3. **TQM saver** (`src/engine/tq_model.c`)
+   - `tq_save_tqm()` — writes header + tokenizer + Q4 weights sequentially
+   - Handles BF16 embed passthrough and FP32→BF16 on-the-fly conversion
+   - Supports tied/untied output weights
+
+4. **Converter tool** (`tools/tq_convert.c`)
+   - CLI: `tq_convert model.safetensors tokenizer.json -o model.tqm`
+   - 3-step pipeline: load → quantize Q4 → write TQM
+
+5. **Tokenizer from memory** (`src/engine/tq_tokenizer.c`)
+   - `tq_load_tokenizer_from_memory()` — parse JSON from buffer
+   - `tq_load_tokenizer_from_tqm()` — extract embedded tokenizer from .tqm file
+   - `tq_run` auto-loads embedded tokenizer when no -t flag given
+
+6. **Tests** (`tests/test_tqm.cpp`)
+   - Header size verification (512 bytes)
+   - Magic value verification
+   - Save/load roundtrip with synthetic model (norm + Q4 weight byte-exact match)
+   - Auto-detect format (tq_load_model dispatches correctly)
+   - Tokenizer from-memory loading
+   - All 20 tests pass (6 new TQM tests)
+
+### Files Modified/Created
+- `include/turboquant/tq_engine.h` — tqm_header_t, tq_load_tqm, tq_save_tqm, tq_load_tokenizer_from_memory/tqm
+- `src/engine/tq_model.c` — tq_load_tqm(), tq_save_tqm(), auto-detect in tq_load_model()
+- `src/engine/tq_tokenizer.c` — tq_load_tokenizer_from_memory(), tq_load_tokenizer_from_tqm()
+- `tools/tq_convert.c` — NEW converter tool
+- `tools/tq_run.c` — auto-load embedded tokenizer from TQM
+- `tests/test_tqm.cpp` — NEW test file (6 tests)
+- `CMakeLists.txt` — added tq_convert build target
+
 ## What Needs Work
 1. Measure actual speed improvement (need model file for tq_run)
 2. Q4 quality on short prompts
 
@@ -96,6 +96,10 @@ target_link_libraries(tq_run turboquant)
 add_executable(debug_compare tools/debug_compare.c)
 target_link_libraries(debug_compare turboquant)
 
+# TQM converter tool
+add_executable(tq_convert tools/tq_convert.c)
+target_link_libraries(tq_convert turboquant)
+
 # Examples (always built)
 file(GLOB EXAMPLE_C_SOURCES examples/*.c)
 file(GLOB EXAMPLE_CXX_SOURCES examples/*.cpp)
 
@@ -0,0 +1,20 @@
+# TurboQuant.cpp — PRD v1.0: TQM Format + Instant Loading
+
+**목표**: 로딩 6초 → 0.1초, 메모리 2.7GB → 270MB, 추론 속도 동일 유지
+
+## 핵심
+
+사전 양자화된 `.tqm` (TurboQuant Model) 포맷을 설계하여:
+1. **mmap 즉시 로딩** — 변환 불필요, 포인터 설정만
+2. **메모리 10x 절약** — Q4 가중치가 디스크에 이미 양자화
+3. **정확도 동일** — 같은 Q4 데이터, bit-exact
+
+## 성공 기준
+
+| 지표 | 현재 (safetensors) | 목표 (.tqm) |
+|------|-------------------|-------------|
+| 로딩 시간 | 6초 | **< 0.5초** |
+| 피크 메모리 | 2.7 GB | **< 400 MB** |
+| 추론 속도 | 16 tok/s | **16 tok/s** (동일) |
+| 텍스트 품질 | "Paris" ✓ | **동일** (bit-exact) |
+| 파일 크기 | 1.7 GB (BF16) | **~300 MB** (Q4) |
@@ -216,12 +216,77 @@ typedef struct {
     int* merge_pairs;    /* [n_merges * 3]: (token_a, token_b, result_id) */
 } tq_tokenizer_t;
 
+/* ============================================================
+ * TQM (TurboQuant Model) binary format — pre-quantized, mmap-ready
+ *
+ * File layout:
+ *   [0..511]          tqm_header_t  (512 bytes, aligned)
+ *   [tok_off..+tok_sz] Tokenizer JSON (raw bytes)
+ *   [wt_off..+wt_sz]  Weights (Q4 packed + FP32 norms + BF16 embeds)
+ *
+ * All weight sections are 64-byte aligned for efficient mmap access.
+ * Q4 weights are stored as (packed_bytes, float_scales) per matrix.
+ * ============================================================ */
+
+#define TQM_MAGIC   0x4D515454  /* "TTQM" in little-endian */
+#define TQM_VERSION 1
+#define TQM_ALIGN   64          /* alignment for weight sections */
+
+#pragma pack(push, 1)
+typedef struct {
+    uint32_t magic;           /* TQM_MAGIC */
+    uint32_t version;         /* TQM_VERSION */
+
+    /* Model config (mirrors tq_model_config_t) */
+    int32_t n_layers;
+    int32_t hidden_dim;
+    int32_t intermediate_dim;
+    int32_t n_heads;
+    int32_t n_kv_heads;
+    int32_t head_dim;
+    int32_t vocab_size;
+    int32_t max_seq_len;
+    float   rope_freq_base;
+    float   rms_norm_eps;
+
+    /* DeltaNet config */
+    int32_t delta_n_heads;
+    int32_t delta_key_head_dim;
+    int32_t delta_value_head_dim;
+    int32_t delta_conv_width;
+    float   partial_rotary_factor;
+    int32_t use_qk_norm;
+    int32_t attn_output_gate;
+
+    /* Quantization config */
+    int32_t weight_quant;     /* 0=FP32, 4=Q4, 8=Q8 */
+    int32_t embed_format;     /* 0=FP32, 16=BF16 */
+
+    /* Section offsets (from file start) */
+    uint64_t tokenizer_offset;
+    uint64_t tokenizer_size;
+    uint64_t weights_offset;
+    uint64_t weights_size;
+
+    /* Layer type map */
+    int32_t n_attn_layers;
+    int32_t attn_layer_indices[64]; /* which layers are self_attn (max 64) */
+
+    /* Padding to 512 bytes.
+     * With pack(1): 8+32+8+16+12+8+32+260 = 376 used, 136 pad */
+    uint8_t _pad[136];
+} tqm_header_t;
+#pragma pack(pop)
+
 /* ============================================================
  * API
  * ============================================================ */
 
 /* Model loading */
 tq_model_t* tq_load_model(const char* path);
+tq_model_t* tq_load_tqm(const char* path);
+int tq_save_tqm(tq_model_t* model, const char* tokenizer_path,
+                const char* output_path);
 void tq_free_model(tq_model_t* model);
 
 /* State management */
@@ -243,6 +308,8 @@ int tq_sample_topp(const float* logits, int vocab_size,
 
 /* Tokenizer */
 tq_tokenizer_t* tq_load_tokenizer(const char* path);
+tq_tokenizer_t* tq_load_tokenizer_from_memory(const char* data, size_t size);
+tq_tokenizer_t* tq_load_tokenizer_from_tqm(const char* tqm_path);
 void tq_free_tokenizer(tq_tokenizer_t* tok);
 int tq_encode(const tq_tokenizer_t* tok, const char* text,
               int* tokens, int max_tokens, int add_bos);