Skip to content

Commit aae96aa

Browse files
K3 Block A vast evidence (post smoke fix): full PASS incl. drafter forward
Re-ran scripts/research/k3_feasibility_smoke.py on an H200 after the drafter vocab fix landed on main (PR #88: bound random ids by the model's embedding vocab, not the DFlash tokenizer's vocab_size=1). Reused the transformers 5.x venv (.venv-k3) and cached weights, so the run was fast (almost no download).

Summary: status=pass; verifier_loadable/forward_ok=true; drafter_loadable/forward_ok=true (was false before the fix).
- Verifier gemma-4-26B-A4B-it: loads; forward at 2.81 tok/s.
- Drafter DFlash: forward OK, 757 tokens in 0.421s, logits [1,757,262144] (vocab 262144 detected from the embedding).
- Memory peak: verifier 51.6GB -> +drafter 55.3GB -> forward 56.2GB, out of 150GB on the H200.

Hardware feasibility confirmed.

Standing caveats (unchanged, not blockers for this smoke):
(1) The verifier needs transformers>=5.0, but run_on_vast.sh still pins <5.0, so this run used a manual .venv-k3.
(2) DFlash still loads as Qwen3ForCausalLM (fc/hidden_norm unexpected; lm_head/embed_tokens newly initialized). The forward pass runs for the feasibility check, but the true block-diffusion architecture is not yet exercised.

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
1 parent 91d9647 commit aae96aa

2 files changed

Lines changed: 136 additions & 0 deletions

File tree

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
{
2+
"schema_version": 1,
3+
"kind": "k3_feasibility_smoke",
4+
"config": {
5+
"platform": "cuda",
6+
"verifier_path": "google/gemma-4-26B-A4B-it",
7+
"drafter_id": "z-lab/gemma-4-26B-A4B-it-DFlash",
8+
"prompt_tokens": 512,
9+
"gen_tokens": 8,
10+
"seed": 42,
11+
"skip_drafter": false
12+
},
13+
"stages": [
14+
{
15+
"stage": "baseline",
16+
"memory": {
17+
"label": "baseline",
18+
"platform": "cuda",
19+
"current_allocated_bytes": 0,
20+
"current_reserved_bytes": 0,
21+
"peak_allocated_bytes": 0,
22+
"peak_reserved_bytes": 0,
23+
"device_total_bytes": 150109880320,
24+
"device_name": "NVIDIA H200"
25+
}
26+
},
27+
{
28+
"stage": "verifier_loaded",
29+
"memory": {
30+
"label": "after_verifier_load",
31+
"platform": "cuda",
32+
"current_allocated_bytes": 51611948032,
33+
"current_reserved_bytes": 51636076544,
34+
"peak_allocated_bytes": 51611948032,
35+
"peak_reserved_bytes": 51636076544,
36+
"device_total_bytes": 150109880320,
37+
"device_name": "NVIDIA H200"
38+
},
39+
"verifier_load_seconds": 14.5070095800329,
40+
"verifier_kind": "transformers_bf16_cuda"
41+
},
42+
{
43+
"stage": "drafter_loaded",
44+
"memory": {
45+
"label": "after_drafter_load",
46+
"platform": "cuda",
47+
"current_allocated_bytes": 55328955904,
48+
"current_reserved_bytes": 55348035584,
49+
"peak_allocated_bytes": 55328955904,
50+
"peak_reserved_bytes": 55348035584,
51+
"device_total_bytes": 150109880320,
52+
"device_name": "NVIDIA H200"
53+
},
54+
"drafter_load_seconds": 3.7502827800344676,
55+
"drafter_kind": "transformers_cuda"
56+
},
57+
{
58+
"stage": "verifier_forward",
59+
"memory": {
60+
"label": "after_verifier_forward",
61+
"platform": "cuda",
62+
"current_allocated_bytes": 55362510336,
63+
"current_reserved_bytes": 56193187840,
64+
"peak_allocated_bytes": 56160551936,
65+
"peak_reserved_bytes": 56193187840,
66+
"device_total_bytes": 150109880320,
67+
"device_name": "NVIDIA H200"
68+
},
69+
"metrics": {
70+
"prefill_seconds": 2.558815949014388,
71+
"gen_seconds": 2.8468999969772995,
72+
"gen_tokens": 8,
73+
"tokens_per_sec": 2.8100741186884024,
74+
"gen_text_head": "\nThe Kakeya inference engine validates",
75+
"prompt_token_count": 757
76+
}
77+
},
78+
{
79+
"stage": "drafter_forward",
80+
"memory": {
81+
"label": "after_drafter_forward",
82+
"platform": "cuda",
83+
"current_allocated_bytes": 55362510336,
84+
"current_reserved_bytes": 56193187840,
85+
"peak_allocated_bytes": 56160551936,
86+
"peak_reserved_bytes": 56193187840,
87+
"device_total_bytes": 150109880320,
88+
"device_name": "NVIDIA H200"
89+
},
90+
"metrics": {
91+
"forward_seconds": 0.42071863100863993,
92+
"input_tokens": 757,
93+
"output_logits_shape": [
94+
1,
95+
757,
96+
262144
97+
],
98+
"drafter_vocab_size_used": 262144
99+
}
100+
}
101+
],
102+
"summary": {
103+
"status": "pass",
104+
"verifier_loadable": true,
105+
"verifier_forward_ok": true,
106+
"drafter_loadable": true,
107+
"drafter_forward_ok": true
108+
}
109+
}

results/research/logs/k3_feasibility_smoke_vast_blockA_1780982359.log

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
[k3-smoke] platform: cuda
2+
[k3-smoke] verifier: google/gemma-4-26B-A4B-it
3+
[k3-smoke] drafter: z-lab/gemma-4-26B-A4B-it-DFlash
4+
[k3-smoke] prompt n: 512
5+
[k3-smoke] gen n: 8
6+
[k3-smoke] loading verifier (CUDA bf16): google/gemma-4-26B-A4B-it
7+
Loading weights: 0%| | 0/1013 [00:00<?, ?it/s]Loading weights: 0%| | 2/1013 [00:00<01:47, 9.42it/s]Loading weights: 0%| | 4/1013 [00:00<02:09, 7.79it/s]Loading weights: 2%|▏ | 25/1013 [00:00<00:18, 54.75it/s]Loading weights: 3%|▎ | 34/1013 [00:00<00:19, 50.62it/s]Loading weights: 5%|▍ | 48/1013 [00:01<00:18, 52.55it/s]Loading weights: 7%|▋ | 69/1013 [00:01<00:14, 62.95it/s]Loading weights: 9%|▉ | 91/1013 [00:01<00:10, 89.67it/s]Loading weights: 10%|█ | 103/1013 [00:01<00:10, 83.45it/s]Loading weights: 11%|█▏ | 114/1013 [00:01<00:13, 66.01it/s]Loading weights: 13%|█▎ | 135/1013 [00:02<00:12, 71.28it/s]Loading weights: 15%|█▌ | 157/1013 [00:02<00:11, 75.20it/s]Loading weights: 18%|█▊ | 178/1013 [00:02<00:08, 95.71it/s]Loading weights: 19%|█▉ | 190/1013 [00:02<00:09, 88.95it/s]Loading weights: 20%|█▉ | 201/1013 [00:02<00:11, 71.81it/s]Loading weights: 22%|██▏ | 222/1013 [00:03<00:08, 92.72it/s]Loading weights: 23%|██▎ | 234/1013 [00:03<00:09, 81.42it/s]Loading weights: 24%|██▍ | 244/1013 [00:03<00:09, 82.49it/s]Loading weights: 25%|██▌ | 254/1013 [00:03<00:10, 71.50it/s]Loading weights: 26%|██▌ | 265/1013 [00:03<00:13, 55.42it/s]Loading weights: 28%|██▊ | 287/1013 [00:04<00:09, 80.19it/s]Loading weights: 29%|██▉ | 298/1013 [00:04<00:09, 71.97it/s]Loading weights: 31%|███ | 309/1013 [00:04<00:09, 77.48it/s]Loading weights: 31%|███▏ | 319/1013 [00:04<00:10, 67.90it/s]Loading weights: 33%|███▎ | 331/1013 [00:04<00:11, 57.42it/s]Loading weights: 35%|███▍ | 353/1013 [00:04<00:08, 82.31it/s]Loading weights: 36%|███▌ | 364/1013 [00:05<00:08, 73.06it/s]Loading weights: 37%|███▋ | 376/1013 [00:05<00:10, 60.67it/s]Loading weights: 39%|███▉ | 396/1013 [00:05<00:07, 79.55it/s]Loading weights: 40%|████ | 406/1013 [00:05<00:08, 71.35it/s]Loading weights: 41%|████▏ | 419/1013 [00:06<00:09, 62.67it/s]Loading weights: 43%|████▎ | 440/1013 [00:06<00:06, 86.46it/s]Loading weights: 45%|████▍ | 452/1013 [00:06<00:07, 79.69it/s]Loading weights: 46%|████▌ | 462/1013 [00:06<00:06, 
82.66it/s]Loading weights: 47%|████▋ | 472/1013 [00:06<00:07, 75.41it/s]Loading weights: 48%|████▊ | 485/1013 [00:06<00:07, 67.05it/s]Loading weights: 50%|████▉ | 506/1013 [00:06<00:05, 91.41it/s]Loading weights: 51%|█████ | 517/1013 [00:07<00:05, 84.15it/s]Loading weights: 52%|█████▏ | 528/1013 [00:07<00:07, 67.84it/s]Loading weights: 54%|█████▍ | 549/1013 [00:07<00:05, 92.46it/s]Loading weights: 55%|█████▌ | 561/1013 [00:07<00:05, 83.42it/s]Loading weights: 56%|█████▋ | 572/1013 [00:07<00:06, 68.29it/s]Loading weights: 59%|█████▊ | 594/1013 [00:08<00:05, 73.86it/s]Loading weights: 61%|██████ | 615/1013 [00:08<00:04, 90.63it/s]Loading weights: 62%|██████▏ | 626/1013 [00:08<00:04, 81.95it/s]Loading weights: 63%|██████▎ | 638/1013 [00:08<00:05, 68.62it/s]Loading weights: 78%|███████▊ | 791/1013 [00:08<00:00, 314.59it/s]Loading weights: 96%|█████████▌| 973/1013 [00:08<00:00, 605.52it/s]Loading weights: 100%|██████████| 1013/1013 [00:08<00:00, 112.82it/s]
8+
[k3-smoke] verifier loaded in 14.5s
9+
[k3-smoke] loading drafter (cuda): z-lab/gemma-4-26B-A4B-it-DFlash
10+
Loading weights: 0%| | 0/56 [00:00<?, ?it/s]Loading weights: 66%|██████▌ | 37/56 [00:00<00:00, 354.44it/s]Loading weights: 100%|██████████| 56/56 [00:00<00:00, 377.28it/s]
11+
[transformers] Qwen3ForCausalLM LOAD REPORT from: z-lab/gemma-4-26B-A4B-it-DFlash
12+
Key | Status |
13+
--------------------------+------------+-
14+
hidden_norm.weight | UNEXPECTED |
15+
fc.weight | UNEXPECTED |
16+
lm_head.weight | MISSING |
17+
model.embed_tokens.weight | MISSING |
18+
19+
Notes:
20+
- UNEXPECTED: can be ignored when loading from different task/architecture; not ok if you expect identical arch.
21+
- MISSING: those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.
22+
[k3-smoke] drafter loaded in 3.8s
23+
[transformers] The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
24+
[k3-smoke] verifier forward OK; gen=8 tokens in 2.85s (2.81 tok/s)
25+
[k3-smoke] drafter forward OK; 757 tokens in 0.421s
26+
[k3-smoke] report -> results/research/k3_feasibility_smoke_vast_blockA_1780982359.json
27+
[k3-smoke] PASS

0 commit comments

Comments
 (0)