llama.cpp-dflash-ggml/autoresearch.jsonl at codex/dflash-ddtree-server-cache-chainonly · Leechael/llama.cpp-dflash-ggml · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
{"type":"config","name":"Optimize DFlash DDTree TPS on Castle","metricName":"tps","metricUnit":"","bestDirection":"higher"}
{"run":1,"commit":"b63fd0d","metric":0,"metrics":{"tps":0},"status":"crash","description":"baseline harness failed before benchmark during sync/ssh setup","timestamp":1777494462566,"segment":0,"confidence":null,"asi":{"hypothesis":"establish Castle e2e baseline using local-to-remote sync and CUDA harness","rollback_reason":"script exited with ssh/rsync code 255 before producing output","next_action_hint":"debug autoresearch.sh sync command; likely rsync invoking ssh differently than direct ssh"}}
{"run":2,"commit":"b63fd0d","metric":0,"metrics":{"tps":0},"status":"crash","description":"baseline harness failed due shell quoting in remote kill command","timestamp":1777494502610,"segment":0,"confidence":null,"asi":{"hypothesis":"fix pkill self-termination in Castle benchmark harness","rollback_reason":"bash expanded $1 locally under set -u in the awk kill snippet","next_action_hint":"escape awk field as \\$1 or avoid awk entirely with pgrep pattern"}}
{"run":3,"commit":"0374171","metric":0.665751,"metrics":{"tps":0.665751,"spec_sec":24.033,"gen_tokens":16,"steps":9,"committed":0,"step_ms":282.13,"pack_ms":10.25,"draft_ms":203.26,"topk_ms":20.85,"exact_ms":47.73,"exact_decode_ms":0,"acceptance":1.889},"status":"keep","description":"baseline Castle e2e chain-only DDTree benchmark with 1024 target feature window","timestamp":1777494603864,"segment":0,"confidence":null,"asi":{"hypothesis":"establish baseline using current stable chain-only exact validation, full target/draft GPU offload, q4 KV, 64k ctx, real rendered prompt","benchmark":"./autoresearch.sh on Castle; gen=16; prompt=/tmp/real_rendered_prompt.txt; LLAMA_DDTREE_TARGET_FEAT_CTX=1024","note":"committed parser returned 0 likely due regex choosing absent/overwritten field; acceptance parsed from e2e output is the useful commit-rate signal"}}
{"type":"config","name":"Optimize DFlash DDTree decode TPS on Castle","metricName":"tps","metricUnit":"","bestDirection":"higher"}
{"run":4,"commit":"6510b31","metric":6.419361,"metrics":{"tps":6.419361,"e2e_tps":1.197874,"spec_sec":26.714,"gen_tokens":32,"steps":18,"committed":0,"step_ms":276.94,"pack_ms":8.88,"draft_ms":201.75,"topk_ms":20.64,"exact_ms":45.63,"exact_decode_ms":0,"acceptance":1.833},"status":"keep","description":"decode-TPS baseline with gen32 after fixing primary metric to exclude prompt/model overhead","timestamp":1777494737663,"segment":1,"confidence":null,"asi":{"hypothesis":"use decode-only TPS as the primary metric because current optimization target is per-step decode cost, not prompt prefill/model load","benchmark":"Castle e2e gen=32, real_rendered_prompt, 64k ctx, q4 KV, target_feat_ctx=1024","note":"committed parser still reports 0 due escaped whitespace regex; output shows committed=33, fix parser next"}}
{"run":5,"commit":"6510b31","metric":6.32999,"metrics":{"tps":6.32999,"e2e_tps":1.207183,"spec_sec":26.508,"gen_tokens":32,"steps":18,"committed":33,"step_ms":280.85,"pack_ms":8.91,"draft_ms":204.58,"topk_ms":21.5,"exact_ms":45.83,"exact_decode_ms":0,"acceptance":1.833},"status":"discard","description":"fix committed metric parser; benchmark noise slightly lower than baseline","timestamp":1777494820583,"segment":1,"confidence":null,"asi":{"hypothesis":"correct benchmark committed-token parsing without changing runtime code","rollback_reason":"primary decode TPS was slightly worse than baseline, likely noise; no runtime optimization tested","next_action_hint":"parser fix is in autoresearch.sh and should be preserved as an autoresearch file; proceed to source-level optimization"}}
{"run":6,"commit":"73f2fb0","metric":7.73072,"metrics":{"tps":7.73072,"e2e_tps":1.251564,"spec_sec":25.568,"gen_tokens":32,"steps":17,"committed":33,"step_ms":243.49,"pack_ms":4.49,"draft_ms":171.03,"topk_ms":19.51,"exact_ms":48.43,"exact_decode_ms":0,"acceptance":1.941},"status":"keep","description":"benchmark target feature window 512 for lower draft/pack cost","timestamp":1777494901661,"segment":1,"confidence":14.673204954627211,"asi":{"hypothesis":"smaller DFlash target feature context may reduce pack/upload and draft compute enough to improve decode TPS on the real rendered prompt","result":"LLAMA_DDTREE_TARGET_FEAT_CTX=512 improved decode TPS from 6.42 to 7.73; pack 8.88->4.49 ms, draft 201.75->171.03 ms, acceptance 1.83->1.94","next_action_hint":"confirm 512 on repeated run and test 256/768; if stable, consider default/window recommendation for server workload"}}
{"run":7,"commit":"2a8eba2","metric":8.388756,"metrics":{"tps":8.388756,"e2e_tps":1.262128,"spec_sec":25.354,"gen_tokens":32,"steps":17,"committed":33,"step_ms":224.39,"pack_ms":2.24,"draft_ms":154.39,"topk_ms":19.77,"exact_ms":47.96,"exact_decode_ms":0,"acceptance":1.941},"status":"keep","description":"benchmark target feature window 256 for further draft/pack reduction","timestamp":1777494974670,"segment":1,"confidence":2.8119551947912877,"asi":{"hypothesis":"target feature context can be reduced below 512 on this prompt without hurting proposal quality, further lowering draft compute and pack cost","result":"256 improved decode TPS to 8.39; draft dropped to 154 ms and pack to 2.24 ms with same acceptance as 512","next_action_hint":"test 128 and 384; watch acceptance/regressions because too short a feature window may harm draft quality on broader prompts"}}
{"run":8,"commit":"3f125a4","metric":8.751467,"metrics":{"tps":8.751467,"e2e_tps":1.277649,"spec_sec":25.046,"gen_tokens":32,"steps":17,"committed":33,"step_ms":215.09,"pack_ms":1.09,"draft_ms":144.76,"topk_ms":20.79,"exact_ms":48.41,"exact_decode_ms":0,"acceptance":1.941},"status":"keep","description":"benchmark target feature window 128","timestamp":1777495047214,"segment":1,"confidence":2.2847052207843856,"asi":{"hypothesis":"a 128-token target feature window may retain enough recent signal while reducing draft attention and upload cost","result":"128 improved decode TPS to 8.75; draft 144.76 ms, pack 1.09 ms, acceptance unchanged vs 256/512","next_action_hint":"test 64 and longer generation/prompt coverage before changing default; risk is hidden acceptance drop on other prompts"}}
{"run":9,"commit":"3f125a4","metric":8.481742,"metrics":{"tps":8.481742,"e2e_tps":1.274139,"spec_sec":25.115,"gen_tokens":32,"steps":17,"committed":33,"step_ms":221.93,"pack_ms":0.47,"draft_ms":153.01,"topk_ms":20.45,"exact_ms":47.96,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"benchmark target feature window 64; pack lower but draft slower than 128","timestamp":1777495120133,"segment":1,"confidence":4.187908592095226,"asi":{"hypothesis":"shrinking target feature window below 128 may further reduce pack and draft context work","rollback_reason":"64 lowered pack to 0.47 ms but draft compute rose to 153 ms and decode TPS fell below the 128-window best","next_action_hint":"treat 128 as the current sweet spot on this prompt; test 96/160 only if tuning window further matters"}}
{"run":10,"commit":"3f125a4","metric":7.958279,"metrics":{"tps":7.958279,"e2e_tps":1.256676,"spec_sec":25.464,"gen_tokens":32,"steps":19,"committed":33,"step_ms":211.63,"pack_ms":0.79,"draft_ms":147.96,"topk_ms":19.88,"exact_ms":42.96,"exact_decode_ms":0,"acceptance":1.737},"status":"discard","description":"benchmark target feature window 96; lower per-step cost lost acceptance","timestamp":1777495203736,"segment":1,"confidence":4.455149647635071,"asi":{"hypothesis":"96-token target feature context might sit between 64 and 128 with lower draft cost but enough proposal quality","rollback_reason":"acceptance dropped from 1.94 to 1.74 and required 19 steps; decode TPS below 128-window best","next_action_hint":"128 remains better than smaller windows for this prompt; test 160/192 or move to non-window optimizations"}}
{"run":11,"commit":"3f125a4","metric":8.742524,"metrics":{"tps":8.742524,"e2e_tps":1.276935,"spec_sec":25.06,"gen_tokens":32,"steps":17,"committed":33,"step_ms":215.31,"pack_ms":1.34,"draft_ms":147.17,"topk_ms":18.84,"exact_ms":47.92,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"benchmark target feature window 160; tied 128 within noise but no improvement","timestamp":1777495276127,"segment":1,"confidence":4.609797945056553,"asi":{"hypothesis":"160-token window may reduce noise versus 128 while keeping acceptance and similar draft cost","rollback_reason":"decode TPS was effectively tied but below the 128-window best; no primary improvement","next_action_hint":"128/160 are close; choose 128 for lower memory/pack unless longer prompt validation favors 160"}}
{"run":12,"commit":"3f125a4","metric":8.341175,"metrics":{"tps":8.341175,"e2e_tps":1.263873,"spec_sec":25.319,"gen_tokens":32,"steps":17,"committed":33,"step_ms":225.67,"pack_ms":1.61,"draft_ms":155.69,"topk_ms":20.27,"exact_ms":48.07,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"benchmark target feature window 192; slower draft than 128/160","timestamp":1777495349316,"segment":1,"confidence":5.810668520414904,"asi":{"hypothesis":"192-token window may retain proposal quality while staying much cheaper than 256/512","rollback_reason":"acceptance matched 128 but draft/step time regressed, so primary TPS fell","next_action_hint":"do not tune larger windows for this prompt unless broader validation shows 128 harms acceptance"}}
{"run":13,"commit":"3f125a4","metric":8.477158,"metrics":{"tps":8.477158,"e2e_tps":1.282257,"spec_sec":24.956,"gen_tokens":32,"steps":17,"committed":33,"step_ms":222.05,"pack_ms":1.08,"draft_ms":153.39,"topk_ms":19.58,"exact_ms":47.97,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"default target feature window 128 plus smaller draft context","timestamp":1777495532959,"segment":1,"confidence":6.104510117006526,"asi":{"hypothesis":"make 128 the code default and size draft n_ctx to target feature window + block so server avoids oversized draft context","rollback_reason":"primary decode TPS was below the prior 128-window env best despite lower set_inputs; draft compute/noise dominated and source change did not improve best","next_action_hint":"revisit draft n_ctx sizing separately with repeated runs; the log showed draft n_ctx=256 and set_inputs ~0.7 ms vs ~4 ms, but step time was not better"}}
{"run":14,"commit":"3f125a4","metric":8.572515,"metrics":{"tps":8.572515,"e2e_tps":1.275053,"spec_sec":25.097,"gen_tokens":32,"steps":17,"committed":33,"step_ms":219.58,"pack_ms":1.08,"draft_ms":149.11,"topk_ms":20.98,"exact_ms":48.38,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"replace small-K heap top-k with fixed array scan","timestamp":1777495681069,"segment":1,"confidence":6.592190362045204,"asi":{"hypothesis":"avoid per-row heap allocation/maintenance in extract_top_k_logprobs for K=8 using fixed stack arrays","rollback_reason":"topk time increased to 20.98 ms and primary TPS remained below the 128-window best; std heap path is not the bottleneck or is better optimized","next_action_hint":"future top-k gains likely need GPU-side logits processing or avoiding logsumexp/scanning, not a small CPU data structure tweak"}}
{"run":15,"commit":"3f125a4","metric":8.708952,"metrics":{"tps":8.708952,"e2e_tps":1.275764,"spec_sec":25.083,"gen_tokens":32,"steps":17,"committed":33,"step_ms":216.14,"pack_ms":1.09,"draft_ms":151.15,"topk_ms":16.14,"exact_ms":47.73,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"benchmark budget 14 with 128-window; lower top-k cost but no TPS win","timestamp":1777495763704,"segment":1,"confidence":7.965359774029058,"asi":{"hypothesis":"budget 14 triggers cheaper K=1/argmax proposal extraction while preserving the same accepted chain on this prompt","rollback_reason":"topk dropped 20.8->16.1 ms but total step time did not beat the 128-window budget-22 best; likely draft/noise dominates","next_action_hint":"test budget 8/1 only if looking for lower overhead mode; budget 14 is close but not a primary improvement"}}
{"run":16,"commit":"3f125a4","metric":8.433481,"metrics":{"tps":8.433481,"e2e_tps":1.267578,"spec_sec":25.245,"gen_tokens":32,"steps":17,"committed":33,"step_ms":223.2,"pack_ms":1.08,"draft_ms":156.77,"topk_ms":17.67,"exact_ms":47.64,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"benchmark budget 8 with 128-window; slower than budget 14/22","timestamp":1777495844676,"segment":1,"confidence":8.4658857012172,"asi":{"hypothesis":"smaller budget may reduce proposal overhead without reducing accepted chain length in exact-validation mode","rollback_reason":"budget 8 had same step count but slower draft/topk aggregate than budget 14 and below best TPS","next_action_hint":"budget tuning is not the next major path; keep budget 22 or 14 and focus draft/exact decode"}}
{"run":17,"commit":"3f125a4","metric":8.123665,"metrics":{"tps":8.123665,"e2e_tps":1.582756,"spec_sec":26.536,"gen_tokens":42,"steps":24,"committed":42,"step_ms":215.42,"pack_ms":1.02,"draft_ms":150.09,"topk_ms":20.84,"exact_ms":43.43,"exact_decode_ms":0,"acceptance":1.75},"status":"discard","description":"confirm 128-window on longer gen request; generation ended at 42 tokens and TPS lower","timestamp":1777495921126,"segment":1,"confidence":7.969102337827434,"asi":{"hypothesis":"longer generation should reduce noise and confirm whether 128-window remains best beyond a 32-token sample","rollback_reason":"request stopped after 42 generated tokens and acceptance fell to 1.75; decode TPS below current best","next_action_hint":"use a non-EOS prompt or different fixture for long-generation confirmation; current rendered prompt is not ideal for 64-token stability"}}
{"run":18,"commit":"c2c95e3","metric":9.112863,"metrics":{"tps":9.112863,"e2e_tps":1.2838,"spec_sec":24.926,"gen_tokens":32,"steps":17,"committed":33,"step_ms":206.56,"pack_ms":1.06,"draft_ms":137.65,"topk_ms":19.8,"exact_ms":48.01,"exact_decode_ms":0,"acceptance":1.941},"status":"keep","description":"change DDTree default target feature window to 128","timestamp":1777496003711,"segment":1,"confidence":8.71562209789579,"asi":{"hypothesis":"make the observed 128-token target feature window the default so server/e2e use the faster draft context without requiring an env override","result":"default 128 produced the best run so far: 9.11 decode TPS, draft 137.65 ms, pack 1.06 ms, acceptance unchanged at 1.94","next_action_hint":"validate on additional prompts before treating 128 as globally safe; env knob still allows raising to 512/1024 if acceptance drops"}}
{"run":19,"commit":"c2c95e3","metric":8.611341,"metrics":{"tps":8.611341,"e2e_tps":1.273784,"spec_sec":25.122,"gen_tokens":32,"steps":17,"committed":33,"step_ms":218.59,"pack_ms":1.07,"draft_ms":149.86,"topk_ms":19.62,"exact_ms":48.01,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"repeat default 128-window run for noise check","timestamp":1777496086262,"segment":1,"confidence":9.96049456675487,"asi":{"hypothesis":"re-run the kept 128 default to estimate noise and verify improvement is not a one-off","rollback_reason":"repeat was below the 9.11 best but still far above the original 1024-window baseline; no new code changes to keep","next_action_hint":"treat 128 default as real but noisy; future comparisons should beat about 8.6-9.1 decode TPS"}}
{"run":20,"commit":"c2c95e3","metric":8.819946,"metrics":{"tps":8.819946,"e2e_tps":1.273885,"spec_sec":25.12,"gen_tokens":32,"steps":17,"committed":33,"step_ms":213.42,"pack_ms":1.09,"draft_ms":144.99,"topk_ms":19.36,"exact_ms":47.94,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"disable DDTree micro-profile logging during benchmark","timestamp":1777496178429,"segment":1,"confidence":10.150139806908188,"asi":{"hypothesis":"LLAMA_DDTREE_PROFILE logging may add per-step overhead, so production benchmarking should disable it","rollback_reason":"no-profile run did not beat the kept 9.11 TPS best, though it remained above baseline","next_action_hint":"keep profiling disabled for user-facing server unless collecting diagnostics; script knob is preserved in autoresearch.sh"}}
{"run":21,"commit":"c2c95e3","metric":8.471256,"metrics":{"tps":8.471256,"e2e_tps":1.271052,"spec_sec":25.176,"gen_tokens":32,"steps":18,"committed":33,"step_ms":209.86,"pack_ms":1.08,"draft_ms":151.2,"topk_ms":12.38,"exact_ms":45.18,"exact_decode_ms":0,"acceptance":1.833},"status":"discard","description":"diagnostic dynamic DDTree block size 8","timestamp":1777496331268,"segment":1,"confidence":10.70842957726881,"asi":{"hypothesis":"smaller draft block size may reduce draft/top-k compute enough to offset lower speculative horizon","rollback_reason":"block size 8 reduced topk but acceptance dropped and an extra step was needed; primary TPS below current 128-window default","next_action_hint":"keep block size 16; reducing block size does not help this prompt"}}
{"run":22,"commit":"c2c95e3","metric":8.249464,"metrics":{"tps":8.249464,"e2e_tps":1.263823,"spec_sec":25.32,"gen_tokens":32,"steps":19,"committed":33,"step_ms":204.16,"pack_ms":0.23,"draft_ms":141.61,"topk_ms":19.45,"exact_ms":42.84,"exact_decode_ms":0,"acceptance":1.737},"status":"discard","description":"benchmark target feature window 32; too little context hurts acceptance","timestamp":1777496404861,"segment":1,"confidence":11.331709410339274,"asi":{"hypothesis":"very small target feature window may lower draft compute enough to compensate for weaker proposals","rollback_reason":"acceptance dropped to 1.74 and 19 steps were needed, so TPS stayed below the 128 default","next_action_hint":"do not go below 128 for this workload unless a broader prompt shows different behavior"}}
{"run":23,"commit":"c2c95e3","metric":8.67164,"metrics":{"tps":8.67164,"e2e_tps":1.279284,"spec_sec":25.014,"gen_tokens":32,"steps":17,"committed":33,"step_ms":217.07,"pack_ms":1.07,"draft_ms":151.99,"topk_ms":16.08,"exact_ms":47.9,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"use K=1 proposal extraction for default exact-validation path","timestamp":1777496508196,"segment":1,"confidence":11.723927501915169,"asi":{"hypothesis":"when batched tree verify is disabled, default exact validation can use a top-1 draft chain instead of K=8 tree proposals to reduce top-k overhead while remaining correct","rollback_reason":"topk improved by ~3.7 ms but total step time stayed below the 128-window best because draft compute/noise dominated","next_action_hint":"K=1 is a possible conservative mode but not a net win in this benchmark; do not keep as default"}}
{"run":24,"commit":"c2c95e3","metric":8.675637,"metrics":{"tps":8.675637,"e2e_tps":1.271102,"spec_sec":25.175,"gen_tokens":32,"steps":17,"committed":33,"step_ms":216.97,"pack_ms":1.06,"draft_ms":148.41,"topk_ms":19.69,"exact_ms":47.77,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"cache DFlash mask token embedding in DDTree driver","timestamp":1777496601681,"segment":1,"confidence":11.829481672771372,"asi":{"hypothesis":"avoid per-step mask embedding lookup/allocation when building draft noise embeddings","rollback_reason":"change was correct but primary TPS did not beat the 128-window best; mask embedding work is not a measurable bottleneck","next_action_hint":"focus on draft graph compute or exact target decode rather than small CPU setup around noise embeddings"}}
{"run":25,"commit":"c2c95e3","metric":0,"metrics":{"tps":0,"e2e_tps":0,"spec_sec":21.707,"gen_tokens":0,"steps":1,"committed":0,"step_ms":0,"pack_ms":3.58,"draft_ms":152.52,"topk_ms":18.22,"exact_ms":0,"exact_decode_ms":0,"acceptance":0},"status":"crash","description":"fast-batched fast-rollback at 128-window still OOMs persist on 64k full-draft","timestamp":1777496697475,"segment":1,"confidence":11.829481672771372,"asi":{"hypothesis":"smaller target feature window might free enough memory for fast rollback persist with -ngl65 -ngld6 -c65536","rollback_reason":"persist allocation still failed needing 1707.75 MiB; with SNAPSHOT_FALLBACK=0 driver returned empty output","next_action_hint":"fast rollback remains nonviable for full-draft 64k on this 24GB GPU unless memory is freed elsewhere; test q4/tq3 or reduced draft offload only if willing to trade draft speed"}}
{"run":26,"commit":"c2c95e3","metric":0,"metrics":{"tps":0,"e2e_tps":0,"spec_sec":0,"gen_tokens":0,"steps":0,"committed":0,"step_ms":0,"pack_ms":0,"draft_ms":0,"topk_ms":0,"exact_ms":0,"exact_decode_ms":0,"acceptance":0},"status":"crash","description":"try tq3 KV to fit fast rollback persist; e2e harness does not accept tq3_0","timestamp":1777496743075,"segment":1,"confidence":11.829481672771372,"asi":{"hypothesis":"TQ3 KV might free enough VRAM for fast rollback persist with full target/draft offload","rollback_reason":"test-speculative-tree-e2e only accepts its hardcoded KV type set and rejected --kv-type tq3_0","next_action_hint":"add tq3_0 support to e2e parser before testing this; do not infer runtime viability from this crash"}}
{"run":27,"commit":"c2c95e3","metric":0,"metrics":{"tps":0,"e2e_tps":0,"spec_sec":26.717,"gen_tokens":0,"steps":1,"committed":0,"step_ms":0,"pack_ms":3.47,"draft_ms":153,"topk_ms":22.52,"exact_ms":0,"exact_decode_ms":0,"acceptance":0},"status":"crash","description":"add tq3_0 e2e parser support and retry fast rollback; persist still OOMs","timestamp":1777496853671,"segment":1,"confidence":11.829481672771372,"asi":{"hypothesis":"with e2e tq3_0 support, smaller KV cache may fit the 1.7 GiB fast rollback persist buffer","rollback_reason":"tq3_0 run still failed CUDA persist allocation for 1707.75 MiB and returned empty output with snapshot fallback disabled","next_action_hint":"fast rollback requires a larger memory reduction than q4->tq3 KV provides in this harness; avoid more persist-fit tests unless freeing draft layers or compressing persist"}}
{"run":28,"commit":"c2c95e3","metric":0,"metrics":{"tps":0,"e2e_tps":0,"spec_sec":21.704,"gen_tokens":0,"steps":1,"committed":0,"step_ms":0,"pack_ms":3.52,"draft_ms":279.6,"topk_ms":16.26,"exact_ms":0,"exact_decode_ms":0,"acceptance":0},"status":"crash","description":"fast rollback with draft GPU layers 4 still OOMs persist","timestamp":1777496939181,"segment":1,"confidence":11.829481672771372,"asi":{"hypothesis":"offloading fewer draft layers may free VRAM for fast rollback persist while retaining enough draft speed","rollback_reason":"ngld4 still failed 1707.75 MiB persist allocation and draft time already rose to 280 ms","next_action_hint":"if testing persist-fit, drop to ngld3 or lower, but expected draft slowdown likely outweighs rollback savings"}}
{"run":29,"commit":"c2c95e3","metric":4.278956,"metrics":{"tps":4.278956,"e2e_tps":1.114323,"spec_sec":28.717,"gen_tokens":32,"steps":18,"committed":33,"step_ms":415.47,"pack_ms":1.09,"draft_ms":294.24,"topk_ms":17.17,"exact_ms":0,"exact_decode_ms":0,"acceptance":1.833},"status":"discard","description":"fast rollback with draft GPU layers 3 fits but is slower than exact path","timestamp":1777497015792,"segment":1,"confidence":11.723927501915169,"asi":{"hypothesis":"ngld3 may free enough VRAM for fast rollback and remove exact decode cost","rollback_reason":"persist fit and exact cost dropped to zero, but draft slowed to 294 ms and target_tree/rollback path made step 415 ms; TPS far below exact default","next_action_hint":"do not trade draft GPU layers for fast rollback on 64k; full draft GPU exact path is better"}}
{"run":30,"commit":"c2c95e3","metric":8.740901,"metrics":{"tps":8.740901,"e2e_tps":1.289127,"spec_sec":24.823,"gen_tokens":32,"steps":17,"committed":33,"step_ms":215.35,"pack_ms":1.07,"draft_ms":147.03,"topk_ms":19.45,"exact_ms":47.77,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"benchmark 32k context for 20k prompt with 128 default","timestamp":1777497087757,"segment":1,"confidence":11.620240385859825,"asi":{"hypothesis":"if the prompt fits under 32k, lower target context may reduce memory/compute overhead versus 64k","rollback_reason":"32k context was above baseline but below the 64k 128-window best; no primary improvement in this e2e workload","next_action_hint":"context reduction may still help server prefill/memory, but decode step optimization should focus elsewhere"}}
{"run":31,"commit":"c2c95e3","metric":8.538297,"metrics":{"tps":8.538297,"e2e_tps":1.284779,"spec_sec":24.907,"gen_tokens":32,"steps":17,"committed":33,"step_ms":220.46,"pack_ms":1.09,"draft_ms":151.06,"topk_ms":20.14,"exact_ms":48.13,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"benchmark 24k context for 20k prompt","timestamp":1777497161415,"segment":1,"confidence":11.723927501915169,"asi":{"hypothesis":"context just above prompt length may reduce decode overhead further than 32k/64k","rollback_reason":"decode TPS below current best; smaller n_ctx did not improve step time on this harness","next_action_hint":"do not spend more iterations on n_ctx for decode TPS; use n_ctx tuning only for memory/server fit"}}
{"run":32,"commit":"c2c95e3","metric":0,"metrics":{"tps":0,"e2e_tps":0,"spec_sec":21.44,"gen_tokens":0,"steps":1,"committed":0,"step_ms":0,"pack_ms":3.54,"draft_ms":146.12,"topk_ms":20.51,"exact_ms":0,"exact_decode_ms":0,"acceptance":0},"status":"crash","description":"smaller draft context plus full-draft fast rollback still OOMs persist","timestamp":1777497248234,"segment":1,"confidence":11.723927501915169,"asi":{"hypothesis":"sizing draft n_ctx to the 128 target-feature window may free enough compute-buffer VRAM for full-draft fast rollback","rollback_reason":"fast rollback still failed persist allocation and returned empty output with snapshot fallback disabled","next_action_hint":"draft context sizing alone does not free the required 1.7 GiB; persist compression is needed for this route"}}
{"run":33,"commit":"c2c95e3","metric":8.497057,"metrics":{"tps":8.497057,"e2e_tps":1.266474,"spec_sec":25.267,"gen_tokens":32,"steps":17,"committed":33,"step_ms":221.53,"pack_ms":1.07,"draft_ms":153.17,"topk_ms":19.25,"exact_ms":48.01,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"skip exact validation seq_rm diagnostic","timestamp":1777497395516,"segment":1,"confidence":11.854680691870985,"asi":{"hypothesis":"default exact chain validation may be clearing an already-empty future range every step; skipping it could save overhead","rollback_reason":"correctness passed but primary TPS dropped; seq_rm is not a meaningful bottleneck","next_action_hint":"keep the safety clear in exact validation"}}
{"run":34,"commit":"c2c95e3","metric":2.679353,"metrics":{"tps":2.679353,"e2e_tps":0.960384,"spec_sec":33.32,"gen_tokens":32,"steps":17,"committed":33,"step_ms":702.54,"pack_ms":1.11,"draft_ms":160.23,"topk_ms":19.65,"exact_ms":0,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"fast-batched with snapshot/replay fallback under 128 default","timestamp":1777497492163,"segment":1,"confidence":11.723927501915169,"asi":{"hypothesis":"trusting batched posterior with snapshot fallback might avoid exact decode while preserving correctness","rollback_reason":"snapshot/tree/replay path made step time 702 ms, far slower than exact chain default; fast-batched without persist rollback is not viable","next_action_hint":"only revisit fast-batched with a no-OOM fast rollback or conditional exact rule; snapshot fallback is too expensive"}}
{"run":35,"commit":"c2c95e3","metric":8.402611,"metrics":{"tps":8.402611,"e2e_tps":1.269942,"spec_sec":25.198,"gen_tokens":32,"steps":17,"committed":33,"step_ms":224.02,"pack_ms":1.09,"draft_ms":154.23,"topk_ms":20.73,"exact_ms":47.95,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"benchmark n_batch 2048 with 512 ubatch","timestamp":1777497577065,"segment":1,"confidence":11.829481672771372,"asi":{"hypothesis":"larger prompt batch may improve prefill or reserve behavior without changing decode acceptance","rollback_reason":"decode step time worsened; n_batch tuning is not a decode TPS improvement","next_action_hint":"keep n_batch=512 for this e2e benchmark unless optimizing prompt prefill separately"}}
{"run":36,"commit":"c2c95e3","metric":8.610159,"metrics":{"tps":8.610159,"e2e_tps":1.233046,"spec_sec":25.952,"gen_tokens":32,"steps":17,"committed":33,"step_ms":218.62,"pack_ms":1.06,"draft_ms":149.81,"topk_ms":19.88,"exact_ms":47.83,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"benchmark n_ubatch 256","timestamp":1777497653606,"segment":1,"confidence":12.65478788505223,"asi":{"hypothesis":"smaller ubatch may reduce memory pressure or improve graph reserve for decode","rollback_reason":"primary TPS below best and e2e time worsened due slower prompt processing; no decode win","next_action_hint":"do not tune ubatch for current decode target"}}
{"run":37,"commit":"c2c95e3","metric":8.817054,"metrics":{"tps":8.817054,"e2e_tps":1.209327,"spec_sec":26.461,"gen_tokens":32,"steps":17,"committed":33,"step_ms":213.49,"pack_ms":1.09,"draft_ms":145.34,"topk_ms":18.88,"exact_ms":48.15,"exact_decode_ms":47.76,"acceptance":1.941},"status":"discard","description":"print exact validation timing split in e2e harness","timestamp":1777497756241,"segment":1,"confidence":11.854680691870985,"asi":{"hypothesis":"surface exact_decode/exact_sample timing in e2e to guide next optimization decisions","rollback_reason":"instrumentation did not improve primary TPS and was reverted by experiment rules, though it confirmed exact_decode is essentially all exact cost","next_action_hint":"if more diagnostics are needed, re-add exact split intentionally; current useful fact: exact_decode 47.76 ms of exact 48.15 ms"}}
{"run":38,"commit":"c2c95e3","metric":7.97639,"metrics":{"tps":7.97639,"e2e_tps":1.251418,"spec_sec":25.571,"gen_tokens":32,"steps":18,"committed":33,"step_ms":222.88,"pack_ms":0.94,"draft_ms":156.66,"topk_ms":19.89,"exact_ms":45.36,"exact_decode_ms":0,"acceptance":1.833},"status":"discard","description":"benchmark target feature window 112","timestamp":1777497962865,"segment":1,"confidence":11.723927501915169,"asi":{"hypothesis":"112-token window might reduce draft context below 128 without dropping acceptance","rollback_reason":"acceptance dropped to 1.83 and an extra step was needed; 128 remains better","next_action_hint":"avoid target feature windows below 128 on this prompt"}}
{"run":39,"commit":"c2c95e3","metric":8.508964,"metrics":{"tps":8.508964,"e2e_tps":1.272163,"spec_sec":25.154,"gen_tokens":32,"steps":17,"committed":33,"step_ms":221.22,"pack_ms":1.21,"draft_ms":151.9,"topk_ms":20.01,"exact_ms":48.07,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"benchmark target feature window 144","timestamp":1777498045067,"segment":1,"confidence":11.854680691870985,"asi":{"hypothesis":"144-token window may be a safer slightly larger default than 128 with similar acceptance","rollback_reason":"acceptance matched 128 but draft/step time was worse and primary TPS below best","next_action_hint":"128 is still the best target feature default among tested windows"}}
{"run":40,"commit":"c2c95e3","metric":0,"metrics":{"tps":0,"e2e_tps":0,"spec_sec":33.226,"gen_tokens":32,"steps":32,"committed":32,"step_ms":212.78,"pack_ms":1.02,"draft_ms":160.08,"topk_ms":21.43,"exact_ms":30.22,"exact_decode_ms":0,"acceptance":1},"status":"crash","description":"tq3_0 KV exact default diverges from chain in e2e","timestamp":1777498174420,"segment":1,"confidence":11.854680691870985,"asi":{"hypothesis":"TQ3 KV may reduce memory or improve speed in exact default mode after adding e2e parser support","rollback_reason":"correctness failed at first generated token: chain[0]=8635, spec[0]=248069; TQ3 is not a safe drop-in for this correctness gate","next_action_hint":"do not use tq3_0 for DDTree correctness until target-only chain/spec equivalence is debugged"}}
{"run":41,"commit":"c2c95e3","metric":0,"metrics":{"tps":0,"e2e_tps":0,"spec_sec":0,"gen_tokens":0,"steps":0,"committed":0,"step_ms":0,"pack_ms":0,"draft_ms":0,"topk_ms":0,"exact_ms":0,"exact_decode_ms":0,"acceptance":0},"status":"crash","description":"q8 KV exact default OOMs with full draft at 64k","timestamp":1777498231508,"segment":1,"confidence":11.854680691870985,"asi":{"hypothesis":"q8 KV might speed exact decode enough to offset larger memory footprint","rollback_reason":"q8_0 KV caused draft context compute buffer allocation OOM with -ngl65 -ngld6 -c65536","next_action_hint":"q4_0 remains the viable full-draft 64k KV type on this GPU"}}
{"run":42,"commit":"c2c95e3","metric":9.049339,"metrics":{"tps":9.049339,"e2e_tps":1.28726,"spec_sec":24.859,"gen_tokens":32,"steps":17,"committed":33,"step_ms":208.01,"pack_ms":1.07,"draft_ms":140.46,"topk_ms":19.16,"exact_ms":47.28,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"q8 KV with 32k context fits and nearly matches best","timestamp":1777498306731,"segment":1,"confidence":11.723927501915169,"asi":{"hypothesis":"for prompts fitting 32k, q8 KV may fit and speed decode compared with q4 at 64k","rollback_reason":"q8_0 + 32k was close but still below the 64k q4 best run, so no primary improvement","next_action_hint":"q8_0 + 32k is a viable alternative for <=32k prompts; retest with server/prompt-prefill metric if memory quality matters"}}
{"run":43,"commit":"c2c95e3","metric":8.663259,"metrics":{"tps":8.663259,"e2e_tps":1.277853,"spec_sec":25.042,"gen_tokens":32,"steps":17,"committed":33,"step_ms":217.28,"pack_ms":1.07,"draft_ms":148.82,"topk_ms":19.59,"exact_ms":47.76,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"repeat q8 KV with 32k context","timestamp":1777498380449,"segment":1,"confidence":12.711493900280791,"asi":{"hypothesis":"confirm whether q8_0 + 32k close-to-best run was stable","rollback_reason":"repeat fell to 8.66 TPS, confirming the 9.05 run was noise/near-best but not better than 64k q4 default","next_action_hint":"do not switch primary config to q8_0 + 32k for decode TPS"}}
{"run":44,"commit":"c2c95e3","metric":8.561987,"metrics":{"tps":8.561987,"e2e_tps":1.273987,"spec_sec":25.118,"gen_tokens":32,"steps":17,"committed":33,"step_ms":219.85,"pack_ms":1.04,"draft_ms":150.19,"topk_ms":20.4,"exact_ms":48.2,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"benchmark target feature window 124","timestamp":1777498457519,"segment":1,"confidence":14.22995076181824,"asi":{"hypothesis":"124 may be just below 128 while preserving acceptance","rollback_reason":"acceptance preserved but step time was worse than 128 default; no primary improvement","next_action_hint":"stop fine-tuning target feature window around 128"}}
{"run":45,"commit":"c2c95e3","metric":5.277915,"metrics":{"tps":5.277915,"e2e_tps":1.165883,"spec_sec":27.447,"gen_tokens":32,"steps":30,"committed":32,"step_ms":202.1,"pack_ms":1,"draft_ms":153.89,"topk_ms":20.42,"exact_ms":26.76,"exact_decode_ms":0,"acceptance":1.067},"status":"discard","description":"do not request draft logits for root slot","timestamp":1777498552553,"segment":1,"confidence":12.711493900280791,"asi":{"hypothesis":"skip unused root-position draft logits to reduce lm_head/output work by one row per draft step","rollback_reason":"correctness remained bit-equal only because exact validation fell back, but draft proposal quality collapsed: acceptance 1.94->1.07 and steps 17->30; logits row indexing/output semantics do not support this simple change","next_action_hint":"do not skip root logits without first understanding llama_get_logits_ith output indexing for sparse logits flags"}}
{"run":46,"commit":"c2c95e3","metric":8.607403,"metrics":{"tps":8.607403,"e2e_tps":1.272113,"spec_sec":25.155,"gen_tokens":32,"steps":17,"committed":33,"step_ms":218.69,"pack_ms":1.07,"draft_ms":148.88,"topk_ms":20.44,"exact_ms":48.27,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"suppress root logits flag but keep original logits row indexing","timestamp":1777498637976,"segment":1,"confidence":14.22995076181824,"asi":{"hypothesis":"root logits can be disabled while keeping llama_get_logits_ith(1) indexing, reducing output work without changing proposals","rollback_reason":"proposal quality stayed intact but draft/topk/step time did not improve enough; primary below best","next_action_hint":"root logits flag is not a useful optimization in current llama.cpp output path"}}
{"run":47,"commit":"c2c95e3","metric":0,"metrics":{"tps":0,"e2e_tps":0,"spec_sec":0,"gen_tokens":0,"steps":0,"committed":0,"step_ms":0,"pack_ms":0,"draft_ms":0,"topk_ms":0,"exact_ms":0,"exact_decode_ms":0,"acceptance":0},"status":"crash","description":"disable Flash Attention diagnostic with q4 KV","timestamp":1777498814939,"segment":1,"confidence":14.22995076181824,"asi":{"hypothesis":"small one-token exact decode and draft attention might be faster without Flash Attention overhead","rollback_reason":"q4_0 V cache quantization requires flash_attn, so target context creation failed before benchmark","next_action_hint":"only test no-flash with f16 KV if memory allows; q4/q8 quantized V requires Flash Attention"}}
{"run":48,"commit":"c2c95e3","metric":0,"metrics":{"tps":0,"e2e_tps":0,"spec_sec":0,"gen_tokens":0,"steps":0,"committed":0,"step_ms":0,"pack_ms":0,"draft_ms":0,"topk_ms":0,"exact_ms":0,"exact_decode_ms":0,"acceptance":0},"status":"crash","description":"no-flash with f16 KV at 32k OOMs target compute buffer","timestamp":1777498859775,"segment":1,"confidence":14.22995076181824,"asi":{"hypothesis":"use f16 KV at 32k to allow no-flash attention and test whether one-token decode is faster without Flash Attention","rollback_reason":"context creation failed allocating 1739 MiB CUDA compute buffer; no benchmark ran","next_action_hint":"no-flash is not viable in current full-offload/full-draft memory envelope; abandon this path"}}
{"run":49,"commit":"c2c95e3","metric":8.674837,"metrics":{"tps":8.674837,"e2e_tps":1.274088,"spec_sec":25.116,"gen_tokens":32,"steps":17,"committed":33,"step_ms":216.99,"pack_ms":1.08,"draft_ms":146.47,"topk_ms":20.99,"exact_ms":48.42,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"force chain recurrent kernel diagnostic","timestamp":1777498962837,"segment":1,"confidence":16.05291169266154,"asi":{"hypothesis":"forcing chain recurrent kernels might reduce exact one-token decode overhead or avoid tree-kernel dispatch cost in exact validation","rollback_reason":"correctness passed but step time stayed worse than best and primary TPS stayed below it; exact remained about 48 ms and topk worsened","next_action_hint":"leave chain-kernel force unset for normal runs"}}
{"run":50,"commit":"c2c95e3","metric":8.605435,"metrics":{"tps":8.605435,"e2e_tps":1.275154,"spec_sec":25.095,"gen_tokens":32,"steps":17,"committed":33,"step_ms":218.74,"pack_ms":0.73,"draft_ms":150.02,"topk_ms":19.64,"exact_ms":48.32,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"bulk-copy contiguous spans when packing target feature ring","timestamp":1777499134443,"segment":1,"confidence":16.106475474044874,"asi":{"hypothesis":"replace per-column target_feat ring packing with one or two bulk memcpy calls to reduce the remaining pack/upload overhead without changing model behavior","rollback_reason":"pack time improved from about 1.06 ms to 0.73 ms, but total step time and primary TPS stayed below the 128-window best; pack is now too small to move throughput alone","next_action_hint":"do not spend more iterations on CPU ring packing until larger draft/exact decode bottlenecks are addressed"}}
{"run":51,"commit":"c2c95e3","metric":8.567053,"metrics":{"tps":8.567053,"e2e_tps":1.269287,"spec_sec":25.211,"gen_tokens":32,"steps":17,"committed":33,"step_ms":219.72,"pack_ms":1.09,"draft_ms":151.19,"topk_ms":19.48,"exact_ms":47.93,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"allow DFlash target_feat graph input to reuse draft graph","timestamp":1777499394932,"segment":1,"confidence":18.011796096054017,"asi":{"hypothesis":"DFlash draft graph was not reused because target_feat input lacked can_reuse; allowing reuse should stabilize CUDA graph/build and reduce draft overhead without changing math","rollback_reason":"graph reuse worked after the first step (reused=1, build_alloc=0) but primary TPS and draft_ms were worse than the 128-window best; graph rebuild cost was not a bottleneck","next_action_hint":"do not prioritize graph reuse; focus on actual draft compute (fc/attention/lm_head) or exact target decode"}}
{"run":52,"commit":"c2c95e3","metric":0,"metrics":{"tps":0,"e2e_tps":0,"spec_sec":21.679,"gen_tokens":0,"steps":1,"committed":0,"step_ms":0,"pack_ms":3.62,"draft_ms":154.37,"topk_ms":18.85,"exact_ms":0,"exact_decode_ms":0,"acceptance":0},"status":"crash","description":"fast rollback with budget 14 to reduce persist footprint","timestamp":1777499530631,"segment":1,"confidence":18.011796096054017,"asi":{"hypothesis":"lowering DDTree budget from 22 to 14 reduces persist allocation from about 1.7 GiB to about 1.1 GiB and may allow full-draft fast rollback at 64k","rollback_reason":"persist allocation still OOMed at 1086.75 MiB; driver produced empty output with snapshot fallback disabled","next_action_hint":"full-draft fast rollback needs more than budget reduction; try lower target/draft compute buffers only if expected target_tree cost can still beat exact chain"}}
{"run":53,"commit":"c2c95e3","metric":7.570506,"metrics":{"tps":7.570506,"e2e_tps":1.246834,"spec_sec":25.665,"gen_tokens":32,"steps":19,"committed":34,"step_ms":222.47,"pack_ms":1.05,"draft_ms":154.78,"topk_ms":22.6,"exact_ms":43.99,"exact_decode_ms":0,"acceptance":1.789},"status":"discard","description":"diagnostic larger draft block size 24","timestamp":1777499663146,"segment":1,"confidence":16.106475474044874,"asi":{"hypothesis":"increasing DDTree draft block size beyond the trained/default 16 may expose a longer proposal horizon and reduce exact validation steps without changing correctness because exact validation remains authoritative","rollback_reason":"block size 24 passed bit-equal correctness but acceptance dropped to 1.79, steps rose to 19, and topk/draft costs increased; primary TPS below best","next_action_hint":"do not increase DFlash block size above 16 for this model; the draft quality beyond the default horizon is worse"}}
{"run":54,"commit":"abe969d","metric":9.820771,"metrics":{"tps":9.820771,"e2e_tps":1.295075,"spec_sec":24.709,"gen_tokens":32,"steps":16,"committed":33,"step_ms":203.65,"pack_ms":1.08,"draft_ms":134.43,"topk_ms":17.84,"exact_ms":50.27,"exact_decode_ms":0,"acceptance":2.062},"status":"keep","description":"increase DDTree budget to 32 for exact-validation path","timestamp":1777499764379,"segment":1,"confidence":19.93149922357974,"asi":{"hypothesis":"larger DDTree budget may include more alternative draft branches so exact chain validation can accept farther before falling off the proposal tree; top-k cost should stay similar because K remains 8","result":"budget 32 improved decode TPS to 9.82, reduced steps 17->16, and raised acceptance to 2.06 despite exact_ms rising per step","next_action_hint":"confirm with repeat and test budget 40/48; watch for prompt overfit because larger budgets may help only when alternate branches match the exact path"}}
{"run":55,"commit":"abe969d","metric":9.003331,"metrics":{"tps":9.003331,"e2e_tps":1.282822,"spec_sec":24.945,"gen_tokens":32,"steps":16,"committed":33,"step_ms":222.14,"pack_ms":1.08,"draft_ms":150.07,"topk_ms":19.8,"exact_ms":51.16,"exact_decode_ms":0,"acceptance":2.062},"status":"discard","description":"test DDTree budget 40 after budget 32 improvement","timestamp":1777499863976,"segment":1,"confidence":19.318235385991656,"asi":{"hypothesis":"budget 40 may further improve proposal coverage beyond budget 32 while keeping the same number of exact-validation steps","rollback_reason":"acceptance matched budget 32 but step time regressed to 222 ms, especially draft/topk, so primary TPS fell below the budget-32 best","next_action_hint":"budget 32 appears better than 40 on this prompt; test budget 28/36 or repeat 32 for noise before changing server defaults further"}}
{"run":56,"commit":"abe969d","metric":9.144111,"metrics":{"tps":9.144111,"e2e_tps":1.287467,"spec_sec":24.855,"gen_tokens":32,"steps":16,"committed":33,"step_ms":218.72,"pack_ms":1.09,"draft_ms":145.96,"topk_ms":20.45,"exact_ms":51.19,"exact_decode_ms":0,"acceptance":2.062},"status":"discard","description":"test DDTree budget 28 below budget-32 best","timestamp":1777499961123,"segment":1,"confidence":19.01142448327141,"asi":{"hypothesis":"budget 28 may retain the step-count win from budget 32 while reducing tree/proposal overhead","rollback_reason":"steps and acceptance matched budget 32, but step time was slower and primary TPS remained below 9.82","next_action_hint":"budget 32 remains the best tested budget; test 30/34 or repeat 32 to confirm noise"}}
{"run":57,"commit":"abe969d","metric":8.99928,"metrics":{"tps":8.99928,"e2e_tps":1.257022,"spec_sec":25.457,"gen_tokens":32,"steps":16,"committed":33,"step_ms":222.24,"pack_ms":1.08,"draft_ms":149.56,"topk_ms":20.23,"exact_ms":51.33,"exact_decode_ms":0,"acceptance":2.062},"status":"discard","description":"repeat DDTree budget 32 to confirm previous best against noise","timestamp":1777500057910,"segment":1,"confidence":19.196128504310348,"asi":{"hypothesis":"repeat budget 32 because the previous 9.82 TPS improvement may include noise; confirm whether the step-count/acceptance gain is stable","rollback_reason":"repeat preserved the 16-step and 2.06 acceptance behavior but step time regressed to 222 ms, so primary TPS was below the kept best","next_action_hint":"budget 32 still looks structurally useful via fewer steps, but compare future ideas against a noisy 9.0-9.8 TPS range rather than a single best run"}}
{"run":58,"commit":"abe969d","metric":9.0151,"metrics":{"tps":9.0151,"e2e_tps":1.280307,"spec_sec":24.994,"gen_tokens":32,"steps":16,"committed":33,"step_ms":221.85,"pack_ms":1.07,"draft_ms":148.88,"topk_ms":20.24,"exact_ms":51.62,"exact_decode_ms":0,"acceptance":2.062},"status":"discard","description":"test DDTree budget 30 between budget 28 and 32","timestamp":1777500149659,"segment":1,"confidence":19.077213862263655,"asi":{"hypothesis":"budget 30 may keep the 16-step acceptance gain from budget 32 while trimming proposal overhead versus larger budgets","rollback_reason":"budget 30 matched the 16-step/2.06 acceptance pattern but step time stayed around 222 ms and primary TPS remained below the kept budget-32 best","next_action_hint":"budget 30/28/40 do not beat budget 32; try budget 34/36 or switch away from budget tuning if repeats remain noisy"}}
{"run":59,"commit":"abe969d","metric":9.208951,"metrics":{"tps":9.208951,"e2e_tps":1.286587,"spec_sec":24.872,"gen_tokens":32,"steps":16,"committed":33,"step_ms":217.18,"pack_ms":1.1,"draft_ms":144.66,"topk_ms":19.97,"exact_ms":51.42,"exact_decode_ms":0,"acceptance":2.062},"status":"discard","description":"test DDTree budget 36 above budget-32 best","timestamp":1777500249227,"segment":1,"confidence":18.755483015403495,"asi":{"hypothesis":"budget 36 may preserve the 16-step acceptance gain from budget 32 while giving slightly better branch coverage than 30/32 without the overhead seen at 40","rollback_reason":"budget 36 passed and was better than 28/30/40, but still below the kept budget-32 best of 9.82 TPS","next_action_hint":"budget 34 is the last nearby budget worth testing; otherwise budget tuning appears saturated and noisy around 9.0-9.8 TPS"}}
{"run":60,"commit":"abe969d","metric":9.048545,"metrics":{"tps":9.048545,"e2e_tps":1.250684,"spec_sec":25.586,"gen_tokens":32,"steps":16,"committed":33,"step_ms":221.03,"pack_ms":1.08,"draft_ms":147.2,"topk_ms":21.15,"exact_ms":51.57,"exact_decode_ms":0,"acceptance":2.062},"status":"discard","description":"test DDTree budget 34 near budget-32 best","timestamp":1777500350130,"segment":1,"confidence":18.510168209448402,"asi":{"hypothesis":"budget 34 may retain the 16-step acceptance improvement from budget 32 while finding a lower-overhead point than 36/40","rollback_reason":"budget 34 matched the 16-step and 2.06 acceptance pattern, but topk/step time were worse and primary TPS remained below the kept budget-32 best","next_action_hint":"stop local budget sweep around 32; nearby budgets 28/30/34/36/40 all failed to beat 32, so switch to a different bottleneck such as exact target decode or draft compute"}}
{"run":61,"commit":"abe969d","metric":8.570174,"metrics":{"tps":8.570174,"e2e_tps":1.266524,"spec_sec":25.266,"gen_tokens":32,"steps":17,"committed":33,"step_ms":219.64,"pack_ms":1.35,"draft_ms":150.5,"topk_ms":19.51,"exact_ms":48.24,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"test budget 32 with target feature window 160","timestamp":1777500462376,"segment":1,"confidence":18.755483015403495,"asi":{"hypothesis":"with the larger budget-32 proposal tree, a slightly longer target-feature window may improve draft proposal quality enough to offset extra pack/draft cost","rollback_reason":"target_feat_ctx=160 lost the budget-32 structural gain: steps rose from 16 to 17 and acceptance fell to 1.94, so TPS dropped","next_action_hint":"keep target feature window at 128 for budget-32; do not combine larger windows with larger budget on this workload"}}
{"run":62,"commit":"abe969d","metric":9.649133,"metrics":{"tps":9.649133,"e2e_tps":1.291572,"spec_sec":24.776,"gen_tokens":32,"steps":17,"committed":33,"step_ms":195.08,"pack_ms":0.95,"draft_ms":128.19,"topk_ms":18.6,"exact_ms":47.32,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"test budget 32 with smaller target feature window 112","timestamp":1777500568011,"segment":1,"confidence":18.510168209448402,"asi":{"hypothesis":"with budget 32, a smaller 112-token target-feature window may reduce draft compute enough to offset losing some proposal quality","rollback_reason":"draft/step time improved significantly, but steps rose to 17 and acceptance fell to 1.94; primary TPS stayed below the kept budget-32 best of 9.82","next_action_hint":"try an intermediate window such as 120 with budget 32, or repeat 112 if prioritizing lower draft latency over best TPS"}}
{"run":63,"commit":"abe969d","metric":8.489392,"metrics":{"tps":8.489392,"e2e_tps":1.270648,"spec_sec":25.184,"gen_tokens":32,"steps":17,"committed":33,"step_ms":221.73,"pack_ms":1.02,"draft_ms":152.51,"topk_ms":20.3,"exact_ms":47.87,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"test budget 32 with target feature window 120","timestamp":1777500665459,"segment":1,"confidence":18.755483015403495,"asi":{"hypothesis":"target_feat_ctx=120 may sit between 112 and 128, retaining some draft-cost savings while avoiding the acceptance loss seen at 112","rollback_reason":"120 did not recover budget-32 acceptance; steps stayed 17 and draft/step time was worse than 112 and 128, so primary TPS fell","next_action_hint":"do not continue fine-tuning between 112 and 128; use 128 for structural 16-step behavior or 112 only as lower-latency non-best variant"}}
{"run":64,"commit":"abe969d","metric":9.104151,"metrics":{"tps":9.104151,"e2e_tps":1.286174,"spec_sec":24.88,"gen_tokens":32,"steps":16,"committed":33,"step_ms":219.68,"pack_ms":1.07,"draft_ms":147.78,"topk_ms":19.46,"exact_ms":51.33,"exact_decode_ms":0,"acceptance":2.062},"status":"discard","description":"test budget 32 with DDTree micro-profile logging disabled","timestamp":1777500762894,"segment":1,"confidence":18.510168209448402,"asi":{"hypothesis":"per-step DDTree micro-profile logging may add overhead in the budget-32 configuration, so disabling it could improve production-like decode TPS","rollback_reason":"profile-disabled run preserved the 16-step/2.06 acceptance behavior but did not beat the kept budget-32 best; primary TPS stayed in the noisy ~9.0 band","next_action_hint":"profiling can remain disabled in production, but it is not a major optimization lever; move to draft compute or exact decode changes"}}
{"run":65,"commit":"abe969d","metric":8.427062,"metrics":{"tps":8.427062,"e2e_tps":1.266675,"spec_sec":25.263,"gen_tokens":32,"steps":17,"committed":33,"step_ms":223.37,"pack_ms":1.06,"draft_ms":153.86,"topk_ms":20.47,"exact_ms":47.95,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"test budget 32 with 32k context and q8 KV","timestamp":1777500859781,"segment":1,"confidence":18.755483015403495,"asi":{"hypothesis":"combine the budget-32 branch-coverage gain with the previously viable 32k/q8_0 context to see if q8 exact decode or lower context improves throughput","rollback_reason":"q8_0 + 32k lost the budget-32 structural gain: steps rose to 17 and acceptance fell to 1.94, with slower draft/step time","next_action_hint":"keep q4_0 64k for budget-32; q8_0/32k does not combine well with this proposal tree"}}
{"run":66,"commit":"abe969d","metric":8.867312,"metrics":{"tps":8.867312,"e2e_tps":1.283903,"spec_sec":24.924,"gen_tokens":32,"steps":17,"committed":33,"step_ms":212.28,"pack_ms":0.96,"draft_ms":143.28,"topk_ms":20.12,"exact_ms":47.89,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"test budget 32 with target feature window 116","timestamp":1777500957905,"segment":1,"confidence":18.510168209448402,"asi":{"hypothesis":"target_feat_ctx=116 may keep most of the draft-cost reduction seen at 112 while recovering some proposal quality toward the 128-window budget-32 path","rollback_reason":"116 kept lower draft cost but still lost the budget-32 structural win: steps stayed 17 and acceptance stayed 1.94, so primary TPS remained below best","next_action_hint":"stop testing sub-128 windows with budget 32; they lower per-step cost but consistently lose the 16-step acceptance behavior"}}
{"run":67,"commit":"abe969d","metric":8.537135,"metrics":{"tps":8.537135,"e2e_tps":1.284986,"spec_sec":24.903,"gen_tokens":32,"steps":17,"committed":33,"step_ms":220.49,"pack_ms":1.07,"draft_ms":151.17,"topk_ms":20.16,"exact_ms":48.05,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"test budget 32 with 32k context and q4 KV","timestamp":1777501067545,"segment":1,"confidence":18.755483015403495,"asi":{"hypothesis":"for prompts fitting under 32k, lowering n_ctx while keeping q4 KV may reduce decode memory/compute overhead and combine with the budget-32 proposal tree","rollback_reason":"32k context lost the budget-32 structural gain: steps rose to 17 and acceptance fell to 1.94, with TPS well below the 64k budget-32 best","next_action_hint":"keep 64k q4 for budget-32 on this workload; context reduction changes proposal/target behavior enough to lose acceptance"}}
{"run":68,"commit":"abe969d","metric":9.041183,"metrics":{"tps":9.041183,"e2e_tps":1.284161,"spec_sec":24.919,"gen_tokens":32,"steps":16,"committed":33,"step_ms":221.21,"pack_ms":1.14,"draft_ms":148.76,"topk_ms":20.16,"exact_ms":51.12,"exact_decode_ms":0,"acceptance":2.062},"status":"discard","description":"test budget 32 with target feature window 136","timestamp":1777501168604,"segment":1,"confidence":18.510168209448402,"asi":{"hypothesis":"target_feat_ctx=136 may preserve the 16-step budget-32 acceptance behavior while being close enough to 128 to avoid the overhead/regression seen at 160","rollback_reason":"136 preserved the 16-step and 2.06 acceptance behavior, but step time was around 221 ms and primary TPS stayed below the budget-32 best","next_action_hint":"128 remains the best window for budget-32; larger windows preserve acceptance but do not improve throughput"}}
{"run":69,"commit":"abe969d","metric":9.219989,"metrics":{"tps":9.219989,"e2e_tps":1.28236,"spec_sec":24.954,"gen_tokens":32,"steps":16,"committed":33,"step_ms":216.92,"pack_ms":1.09,"draft_ms":144.92,"topk_ms":19.97,"exact_ms":50.9,"exact_decode_ms":0,"acceptance":2.062},"status":"discard","description":"test budget 32 with draft top-k capped at 4","timestamp":1777502096247,"segment":1,"confidence":15.883380263274644,"asi":{"hypothesis":"budget 32 may not need K=8 alternatives per row; capping draft top-k to 4 could preserve the 16-step acceptance behavior while reducing top-k/tree overhead","rollback_reason":"K=4 preserved 16 steps and 2.06 acceptance but primary TPS stayed below the kept budget-32 best; topk did not materially improve","next_action_hint":"top-k cardinality is not the main cost; do not add a TOP_K knob unless needed for diagnostics"}}
{"run":70,"commit":"abe969d","metric":9.147875,"metrics":{"tps":9.147875,"e2e_tps":1.279335,"spec_sec":25.013,"gen_tokens":32,"steps":16,"committed":34,"step_ms":218.63,"pack_ms":1.08,"draft_ms":147.84,"topk_ms":17.36,"exact_ms":52.32,"exact_decode_ms":0,"acceptance":2.125},"status":"discard","description":"cap DDTree proposal rows to 8 for budget 32","timestamp":1777502354709,"segment":1,"confidence":15.856576119639561,"asi":{"hypothesis":"exact-validation path rarely needs all 15 draft rows; capping DDTree proposal rows to 8 may preserve acceptance while reducing CPU top-k scan cost","rollback_reason":"topk improved to 17.36 ms and acceptance stayed strong, but exact cost rose and total step time remained worse than the kept best; also broad prompts could need deeper rows","next_action_hint":"tree-row capping can reduce topk but is not enough alone; avoid keeping because it risks hurting prompts with long accepted runs"}}
{"run":71,"commit":"abe969d","metric":8.451759,"metrics":{"tps":8.451759,"e2e_tps":1.584547,"spec_sec":26.506,"gen_tokens":42,"steps":23,"committed":42,"step_ms":216.06,"pack_ms":1.03,"draft_ms":149.5,"topk_ms":20.1,"exact_ms":45.4,"exact_decode_ms":0,"acceptance":1.826},"status":"discard","description":"validate budget 32 on longer generation request","timestamp":1777502458355,"segment":1,"confidence":15.883380263274644,"asi":{"hypothesis":"longer generation should check whether the budget-32 16-step gain generalizes beyond the short 32-token sample and reduce overfitting risk","rollback_reason":"the prompt ended at 42 generated tokens and acceptance fell to 1.83; decode TPS below budget-32 short-run best","next_action_hint":"use an additional non-EOS prompt fixture for robust validation; this prompt is useful but not sufficient for long-run budget tuning"}}
{"run":72,"commit":"abe969d","metric":9.227646,"metrics":{"tps":9.227646,"e2e_tps":1.283852,"spec_sec":24.925,"gen_tokens":32,"steps":16,"committed":33,"step_ms":216.74,"pack_ms":1.08,"draft_ms":144.28,"topk_ms":20.3,"exact_ms":51.03,"exact_decode_ms":0,"acceptance":2.062},"status":"discard","description":"test pure best-first DDTree without greedy chain seed at budget 32","timestamp":1777502608147,"segment":1,"confidence":15.856576119639561,"asi":{"hypothesis":"chain_seed spends many budget-32 nodes on a deep greedy spine even though exact acceptance averages only about 2 tokens; pure best-first may allocate more shallow alternatives and improve proposal coverage","rollback_reason":"chain_seed=0 preserved the 16-step/2.06 acceptance behavior but did not beat the kept budget-32 best; no primary improvement and it may be less robust on prompts with long greedy runs","next_action_hint":"keep chain_seed enabled; if revisiting tree construction, use a hybrid depth cap rather than disabling the greedy spine entirely"}}
{"run":73,"commit":"abe969d","metric":9.040774,"metrics":{"tps":9.040774,"e2e_tps":1.281897,"spec_sec":24.963,"gen_tokens":32,"steps":16,"committed":33,"step_ms":221.22,"pack_ms":1.09,"draft_ms":148.96,"topk_ms":20.16,"exact_ms":50.98,"exact_decode_ms":0,"acceptance":2.062},"status":"discard","description":"hybrid DDTree chain seed capped to depth 4 at budget 32","timestamp":1777502734449,"segment":1,"confidence":15.776850112479453,"asi":{"hypothesis":"cap the pre-inserted greedy spine to depth 4 so budget-32 keeps robust shallow branch coverage while still preserving enough greedy-chain depth","rollback_reason":"chain-depth cap preserved the 16-step/2.06 acceptance pattern but step time stayed around 221 ms and did not beat the kept budget-32 best","next_action_hint":"hybrid chain-depth capping does not improve this workload; avoid further tree construction tweaks unless using a broader prompt corpus"}}
{"run":74,"commit":"abe969d","metric":9.173891,"metrics":{"tps":9.173891,"e2e_tps":1.284058,"spec_sec":24.921,"gen_tokens":32,"steps":16,"committed":33,"step_ms":218.01,"pack_ms":1.07,"draft_ms":146.71,"topk_ms":19.2,"exact_ms":51,"exact_decode_ms":0,"acceptance":2.062},"status":"discard","description":"test budget 32 with target feature window 124","timestamp":1777502840662,"segment":1,"confidence":15.556627806464379,"asi":{"hypothesis":"target_feat_ctx=124 may be the smallest window that preserves the budget-32 16-step acceptance behavior while slightly reducing draft compute versus 128","rollback_reason":"124 preserved 16 steps and 2.06 acceptance, but primary TPS stayed below the kept budget-32 best and within the noisy 9.0-9.2 band","next_action_hint":"128 remains the best kept setting; 124 is viable but not a primary improvement, so move away from window tuning"}}
{"run":75,"commit":"abe969d","metric":0,"metrics":{"tps":0,"e2e_tps":0,"spec_sec":21.522,"gen_tokens":0,"steps":1,"committed":0,"step_ms":0,"pack_ms":3.54,"draft_ms":201.54,"topk_ms":15.37,"exact_ms":0,"exact_decode_ms":0,"acceptance":0},"status":"crash","description":"fast rollback budget 14 with draft GPU layers 5","timestamp":1777502969042,"segment":1,"confidence":15.556627806464379,"asi":{"hypothesis":"budget 14 reduces persist to about 1.09 GiB; moving one draft layer off GPU may free enough VRAM for fast rollback while keeping draft faster than the known ngld3 path","rollback_reason":"persist allocation still OOMed at 1086.75 MiB, and draft time already rose to 201 ms with ngld5; no tokens generated","next_action_hint":"do not pursue small draft-offload reductions for persist fit; memory fragmentation or compute buffers still prevent budget14 persist allocation, and draft slowdown erodes expected gains"}}
{"run":76,"commit":"abe969d","metric":9.074822,"metrics":{"tps":9.074822,"e2e_tps":1.279232,"spec_sec":25.015,"gen_tokens":32,"steps":16,"committed":33,"step_ms":220.39,"pack_ms":1.09,"draft_ms":148.35,"topk_ms":19.94,"exact_ms":50.97,"exact_decode_ms":0,"acceptance":2.062},"status":"discard","description":"single memcpy fast path for one-token exact hidden-capture ingest","timestamp":1777503129647,"segment":1,"confidence":14.215306045687464,"asi":{"hypothesis":"exact validation ingests one hidden-capture token at a time; for n_tokens=1 the five layer slices are contiguous and can be copied into the target_feat ring with one memcpy instead of five","rollback_reason":"correctness passed but ingest stayed about 9.19 ms and primary TPS remained below the budget-32 best; hidden-capture retrieval/device transfer dominates, not the five small memcpy calls","next_action_hint":"do not optimize CPU memcpy in ingest further; a GPU-resident target_feat ring or capture path would be needed for meaningful ingest savings"}}
{"run":77,"commit":"abe969d","metric":8.99847,"metrics":{"tps":8.99847,"e2e_tps":1.275307,"spec_sec":25.092,"gen_tokens":32,"steps":16,"committed":34,"step_ms":222.26,"pack_ms":1.08,"draft_ms":150.91,"topk_ms":17.85,"exact_ms":52.38,"exact_decode_ms":0,"acceptance":2.125},"status":"discard","description":"cap draft logits outputs to tree rows for budget 32 row-8 proposal","timestamp":1777503251223,"segment":1,"confidence":13.227183816638313,"asi":{"hypothesis":"when capping proposal rows to 8, also avoid requesting draft logits for rows beyond that cap to reduce lm_head/logits work while keeping exact validation authoritative","rollback_reason":"correctness passed and topk remained lower, but draft time increased to 150.91 ms and total TPS fell below the budget-32 best; sparse logits flags do not reduce the dominant draft compute here","next_action_hint":"do not pursue output-row pruning in the draft path; focus on draft internal compute or exact target decode instead"}}
{"run":78,"commit":"abe969d","metric":9.495774,"metrics":{"tps":9.495774,"e2e_tps":1.288245,"spec_sec":24.84,"gen_tokens":32,"steps":16,"committed":33,"step_ms":210.62,"pack_ms":1.09,"draft_ms":138.52,"topk_ms":19.88,"exact_ms":51.09,"exact_decode_ms":0,"acceptance":2.062},"status":"discard","description":"test DDTree budget 26 below 28/32 threshold","timestamp":1777503357658,"segment":1,"confidence":12.929750236155215,"asi":{"hypothesis":"budget 26 may be the smallest budget that preserves the 16-step acceptance gain while reducing tree/proposal overhead versus budget 28/32","rollback_reason":"budget 26 preserved 16 steps and improved step time to 210.62 ms, but primary TPS remained below the kept budget-32 best of 9.82","next_action_hint":"budget 26 is promising and less noisy than nearby budgets; test budget 24 or repeat 26 before considering changing the configured default"}}
{"run":79,"commit":"abe969d","metric":9.011039,"metrics":{"tps":9.011039,"e2e_tps":1.284883,"spec_sec":24.905,"gen_tokens":32,"steps":16,"committed":33,"step_ms":221.95,"pack_ms":1.08,"draft_ms":149.2,"topk_ms":19.99,"exact_ms":51.65,"exact_decode_ms":0,"acceptance":2.062},"status":"discard","description":"test DDTree budget 24 after promising budget 26","timestamp":1777503455626,"segment":1,"confidence":12.590074250645914,"asi":{"hypothesis":"budget 24 may be the smallest budget that retains the 16-step acceptance gain while lowering overhead more than budget 26","rollback_reason":"budget 24 preserved 16 steps but step time regressed to 221.95 ms and TPS fell well below budget 26 and the kept best","next_action_hint":"budget 26 looks more promising than 24; repeat 26 or test 25/27 if continuing budget tuning"}}
{"run":80,"commit":"abe969d","metric":8.965795,"metrics":{"tps":8.965795,"e2e_tps":1.280974,"spec_sec":24.981,"gen_tokens":32,"steps":16,"committed":33,"step_ms":223.07,"pack_ms":1.07,"draft_ms":149.46,"topk_ms":20.71,"exact_ms":51.79,"exact_decode_ms":0,"acceptance":2.062},"status":"discard","description":"repeat DDTree budget 26 to check noise","timestamp":1777503555898,"segment":1,"confidence":10.891133809356086,"asi":{"hypothesis":"repeat budget 26 because the previous 9.50 TPS result may indicate a smaller-budget sweet spot or may be noise","rollback_reason":"repeat preserved 16-step acceptance but regressed to 8.97 TPS, confirming the earlier 9.50 budget-26 run was mostly noise","next_action_hint":"do not keep budget 26; budget 32 remains the configured best despite noisy single-run highs"}}
{"run":81,"commit":"a4884f4","metric":10.147906,"metrics":{"tps":10.147906,"e2e_tps":1.29749,"spec_sec":24.663,"gen_tokens":32,"steps":14,"committed":33,"step_ms":225.24,"pack_ms":1.12,"draft_ms":145.35,"topk_ms":20.04,"exact_ms":58.69,"exact_decode_ms":0,"acceptance":2.357},"status":"keep","description":"add proposal-temperature override and test budget 32 at temp 1.0","timestamp":1777503715337,"segment":1,"confidence":11.576312390556494,"asi":{"hypothesis":"greedy sampling uses --temp 0, but DDTree proposal scoring should not necessarily use near-zero temperature; a separate proposal temperature can allocate budget to useful alternatives while exact validation preserves bit-equal output","result":"LLAMA_DDTREE_PROPOSAL_TEMP=1.0 with budget 32 reduced steps from 16 to 14 and improved decode TPS to 10.15 despite higher exact cost per step","next_action_hint":"confirm on repeat and sweep proposal_temp around 0.5/0.75/1.25; if robust, consider making proposal temperature distinct from sampler temperature by default"}}
{"run":82,"commit":"f282526","metric":10.443251,"metrics":{"tps":10.443251,"e2e_tps":1.302932,"spec_sec":24.56,"gen_tokens":32,"steps":14,"committed":33,"step_ms":218.87,"pack_ms":1.11,"draft_ms":140.22,"topk_ms":19.49,"exact_ms":58.02,"exact_decode_ms":0,"acceptance":2.357},"status":"keep","description":"tune DDTree proposal temperature to 0.75","timestamp":1777503819076,"segment":1,"confidence":12.243787036262265,"asi":{"hypothesis":"proposal_temp=0.75 may keep the useful branch diversity from temp 1.0 while weighting top draft candidates more strongly, reducing per-step overhead or improving acceptance stability","result":"proposal_temp=0.75 matched the 14-step/2.36 acceptance gain and improved decode TPS to 10.44 via lower step time than temp 1.0","next_action_hint":"sweep 0.5 and 0.9/1.25, then repeat the best to check noise; validate on a second prompt before hardcoding any default"}}
{"run":83,"commit":"f282526","metric":9.354263,"metrics":{"tps":9.354263,"e2e_tps":1.28123,"spec_sec":24.976,"gen_tokens":32,"steps":15,"committed":34,"step_ms":228.06,"pack_ms":1.09,"draft_ms":149.88,"topk_ms":20.81,"exact_ms":56.24,"exact_decode_ms":0,"acceptance":2.267},"status":"discard","description":"sweep DDTree proposal temperature 0.5","timestamp":1777503920054,"segment":1,"confidence":12.281436942986181,"asi":{"hypothesis":"proposal_temp=0.5 may focus tree budget on higher-confidence draft branches and reduce wasted alternatives versus 0.75/1.0","rollback_reason":"0.5 reduced acceptance relative to 0.75 and needed 15 steps; step time also worsened, so primary TPS dropped below current best","next_action_hint":"proposal_temp around 0.75 remains best; test 0.9 or 0.625 next rather than going lower"}}
{"run":84,"commit":"f282526","metric":10.070113,"metrics":{"tps":10.070113,"e2e_tps":1.299229,"spec_sec":24.63,"gen_tokens":32,"steps":14,"committed":33,"step_ms":226.98,"pack_ms":1.12,"draft_ms":146.35,"topk_ms":20.69,"exact_ms":58.8,"exact_decode_ms":0,"acceptance":2.357},"status":"discard","description":"sweep DDTree proposal temperature 0.9","timestamp":1777504018992,"segment":1,"confidence":12.265451874159854,"asi":{"hypothesis":"proposal_temp=0.9 may keep the 14-step branch-diversity gain of 0.75/1.0 while being less noisy than 0.75","rollback_reason":"0.9 preserved 14 steps and 2.36 acceptance but step time was worse than 0.75, so primary TPS fell below current best","next_action_hint":"0.75 remains the best tested proposal temperature; try 0.625/0.8 or repeat 0.75 for stability"}}
{"run":85,"commit":"f282526","metric":9.341975,"metrics":{"tps":9.341975,"e2e_tps":1.288712,"spec_sec":24.831,"gen_tokens":32,"steps":15,"committed":34,"step_ms":228.36,"pack_ms":1.11,"draft_ms":149.89,"topk_ms":21.18,"exact_ms":56.13,"exact_decode_ms":0,"acceptance":2.267},"status":"discard","description":"sweep DDTree proposal temperature 0.625","timestamp":1777504113549,"segment":1,"confidence":12.249508362405432,"asi":{"hypothesis":"proposal_temp=0.625 may sit between 0.5 and 0.75, preserving more branch diversity than 0.5 while focusing proposals more than 0.75","rollback_reason":"0.625 behaved like 0.5: 15 steps, 2.27 acceptance, and slower step time; primary TPS well below 0.75","next_action_hint":"lower side of temp sweep is done; 0.75 remains best, test 0.8/0.7 or repeat 0.75"}}
{"run":86,"commit":"f282526","metric":9.949568,"metrics":{"tps":9.949568,"e2e_tps":1.292303,"spec_sec":24.762,"gen_tokens":32,"steps":14,"committed":33,"step_ms":229.73,"pack_ms":1.1,"draft_ms":148.29,"topk_ms":21.43,"exact_ms":58.88,"exact_decode_ms":0,"acceptance":2.357},"status":"discard","description":"sweep DDTree proposal temperature 0.8","timestamp":1777504213478,"segment":1,"confidence":12.153903309794075,"asi":{"hypothesis":"proposal_temp=0.8 may be close to the observed 0.75 optimum while possibly more robust than 0.75","rollback_reason":"0.8 kept the 14-step/2.36 acceptance pattern but step time and topk were worse, so primary TPS stayed below 0.75","next_action_hint":"0.75 remains the best tested proposal temperature; test 0.7 or repeat 0.75 for stability"}}
{"run":87,"commit":"c499e6d","metric":10.507099,"metrics":{"tps":10.507099,"e2e_tps":1.308686,"spec_sec":24.452,"gen_tokens":32,"steps":14,"committed":33,"step_ms":217.54,"pack_ms":1.11,"draft_ms":138.33,"topk_ms":19.95,"exact_ms":58.11,"exact_decode_ms":0,"acceptance":2.357},"status":"keep","description":"sweep DDTree proposal temperature 0.7","timestamp":1777504313200,"segment":1,"confidence":12.22183088063815,"asi":{"hypothesis":"proposal_temp=0.7 may be slightly better than 0.75 by keeping the same 14-step branch diversity while weighting top candidates a little more strongly","result":"0.7 preserved the 14-step/2.36 acceptance behavior and improved decode TPS to 10.51 with lower draft/step time than 0.75","next_action_hint":"repeat 0.7 and test 0.68/0.72; validate on additional prompt before making proposal_temp=0.7 a hard default"}}
{"run":88,"commit":"c499e6d","metric":10.013205,"metrics":{"tps":10.013205,"e2e_tps":1.300654,"spec_sec":24.603,"gen_tokens":32,"steps":14,"committed":32,"step_ms":228.27,"pack_ms":1.11,"draft_ms":149.75,"topk_ms":20.67,"exact_ms":56.7,"exact_decode_ms":0,"acceptance":2.286},"status":"discard","description":"sweep DDTree proposal temperature 0.68","timestamp":1777504416257,"segment":1,"confidence":12.743318525446139,"asi":{"hypothesis":"proposal_temp=0.68 may refine the 0.7 optimum by slightly increasing confidence weighting without falling into the 0.625/0.5 lower-acceptance regime","rollback_reason":"0.68 kept 14 steps but committed/acceptance fell and draft/step time worsened, so primary TPS dropped below 0.7","next_action_hint":"0.7 remains better than the lower side; test 0.72 or repeat 0.7 for noise"}}
{"run":89,"commit":"c499e6d","metric":10.111096,"metrics":{"tps":10.111096,"e2e_tps":1.280974,"spec_sec":24.981,"gen_tokens":32,"steps":14,"committed":33,"step_ms":226.06,"pack_ms":1.11,"draft_ms":144.8,"topk_ms":21.34,"exact_ms":58.76,"exact_decode_ms":0,"acceptance":2.357},"status":"discard","description":"sweep DDTree proposal temperature 0.72","timestamp":1777504517855,"segment":1,"confidence":12.766361853364844,"asi":{"hypothesis":"proposal_temp=0.72 may sit close to the 0.7 optimum while preserving 0.75-like acceptance stability","rollback_reason":"0.72 preserved 14 steps and 2.36 acceptance but topk/step time were worse than 0.7; primary TPS below best","next_action_hint":"proposal_temp 0.7 remains best; repeat 0.7 for confirmation or test nearby 0.69/0.71 only if needed"}}
{"run":90,"commit":"c499e6d","metric":9.937888,"metrics":{"tps":9.937888,"e2e_tps":1.292146,"spec_sec":24.765,"gen_tokens":32,"steps":14,"committed":33,"step_ms":230,"pack_ms":1.11,"draft_ms":149.15,"topk_ms":21.08,"exact_ms":58.63,"exact_decode_ms":0,"acceptance":2.357},"status":"discard","description":"repeat DDTree proposal temperature 0.7","timestamp":1777504613299,"segment":1,"confidence":12.802747380091832,"asi":{"hypothesis":"repeat proposal_temp=0.7 to check whether the 10.51 TPS run was stable or mostly noise","rollback_reason":"repeat preserved 14-step/2.36 acceptance but step time regressed to 230 ms, so primary TPS was below the kept best; structural gain is stable but single-run TPS is noisy","next_action_hint":"treat proposal_temp=0.7 as useful for fewer steps but compare future changes against a 9.9-10.5 TPS noise range"}}
{"run":91,"commit":"c499e6d","metric":3.149191,"metrics":{"tps":3.149191,"e2e_tps":1.013588,"spec_sec":31.571,"gen_tokens":32,"steps":14,"committed":33,"step_ms":725.81,"pack_ms":1.16,"draft_ms":149.93,"topk_ms":20.87,"exact_ms":59.92,"exact_decode_ms":0,"acceptance":2.357},"status":"discard","description":"diagnostic batched-vs-exact trace with proposal temp 0.7","timestamp":1777505377073,"segment":1,"confidence":12.319068657292135,"asi":{"hypothesis":"enable batched tree verify trace to measure batched posterior vs exact validation agreement and margin for possible high-confidence exact-skip rule","rollback_reason":"diagnostic trace adds target-tree/snapshot overhead, so primary TPS is expectedly worse and not a runtime optimization","result":"for visible traced tail, batched_exact_same=14 and diff=0 in stats; parsed 3 trace lines all had batched_commit_n equal exact commit_n with min margin 2.12, avg margin 7.62","next_action_hint":"improve trace capture to emit compact aggregate metrics for all steps instead of relying on tailed logs; this run suggests current prompt/temp0.7 has no batched/exact divergence, but broader prompts are needed before trusting batched"}}
{"run":92,"commit":"c499e6d","metric":3.209461,"metrics":{"tps":3.209461,"e2e_tps":1.019303,"spec_sec":31.394,"gen_tokens":32,"steps":14,"committed":33,"step_ms":712.18,"pack_ms":1.16,"draft_ms":152.9,"topk_ms":20.95,"exact_ms":0,"exact_decode_ms":0,"acceptance":2.357},"status":"discard","description":"fast-batched correctness probe at proposal temp 0.7","timestamp":1777505519305,"segment":1,"confidence":11.723331153709331,"asi":{"hypothesis":"if proposal_temp=0.7 eliminates batched/exact divergence on this prompt, fast-batched can skip exact validation and remain bit-equal, clarifying whether exact skip is correctness-safe before optimizing rollback","rollback_reason":"correctness passed, but snapshot+replay fast-batched path is far slower: step=712 ms due snapshot 177.6 ms, target_tree 118.6 ms, replay 53.2 ms; primary TPS much worse","result":"batched path produced bit-equal output with budget32/temp0.7 on this prompt; stats show committed=33, batched_committed=33, max_commit=4, no divergence in final token sequence","next_action_hint":"exact-skip is only useful if paired with no-snapshot fast rollback or another cheap state-commit path; next root-cause work should quantify/compress persist buffers or add aggregate batched/exact margin diagnostics across prompts"}}
{"run":93,"commit":"c499e6d","metric":0,"metrics":{"tps":0,"e2e_tps":0,"spec_sec":21.672,"gen_tokens":0,"steps":1,"committed":0,"step_ms":0,"pack_ms":3.69,"draft_ms":154.36,"topk_ms":14.63,"exact_ms":0,"exact_decode_ms":0,"acceptance":0},"status":"crash","description":"diagnose fast rollback persist memory split at budget 14","timestamp":1777505657370,"segment":1,"confidence":11.723331153709331,"asi":{"hypothesis":"quantify DFlash fast-rollback persist allocation into SSM vs conv components to see whether conv F32 is the main compressible memory target","rollback_reason":"diagnostic logging changed code and fast rollback still OOMed before generating tokens, so not a throughput improvement","result":"budget14 planned persist split: ssm=1008.00 MiB, conv=78.75 MiB, total=1086.75 MiB; CUDA allocation failed for the full 1086.75 MiB","next_action_hint":"conv F32 is only about 7.2% of budget14 persist; compressing conv alone cannot make fast rollback fit. The dominant target is SSM persist, already F16 on CUDA, or freeing/defragmenting ~1.1 GiB elsewhere"}}
{"run":94,"commit":"c499e6d","metric":0,"metrics":{"tps":0,"e2e_tps":0,"spec_sec":21.723,"gen_tokens":0,"steps":1,"committed":0,"step_ms":0,"pack_ms":3.53,"draft_ms":137.37,"topk_ms":13.7,"exact_ms":0,"exact_decode_ms":0,"acceptance":0},"status":"crash","description":"fast rollback fit probe with budget 8 persist","timestamp":1777505801493,"segment":1,"confidence":11.723331153709331,"asi":{"hypothesis":"using a much smaller budget 8 should reduce DFlash fast-rollback persist to about 621 MiB; if it fits, fast-batched plus rollback can reveal whether cheap state commit can offset lower acceptance","rollback_reason":"budget8 persist still OOMed before any token was generated, so no throughput result; snapshot fallback was disabled intentionally to test true fast rollback fit","result":"CUDA allocation failed for 621.00 MiB persist at n_tokens=8; even the small-budget fast rollback path cannot fit in the current 64k/full-target/full-draft memory envelope","next_action_hint":"fast rollback needs either freeing hundreds of MiB elsewhere before persist allocation or changing allocation strategy/lifetime; smaller budget alone is not enough, so investigate target/draft compute buffers or lower n_ctx/offload as controlled diagnostics"}}
{"run":95,"commit":"c499e6d","metric":8.005844,"metrics":{"tps":8.005844,"e2e_tps":1.269589,"spec_sec":25.205,"gen_tokens":32,"steps":18,"committed":33,"step_ms":222.06,"pack_ms":1.08,"draft_ms":132.93,"topk_ms":15.31,"exact_ms":0,"exact_decode_ms":0,"acceptance":1.833},"status":"discard","description":"fast rollback fit probe at 32k context with budget 8","timestamp":1777505918594,"segment":1,"confidence":11.195080202773184,"asi":{"hypothesis":"lowering n_ctx from 64k to 32k may free enough GPU memory for even budget8 fast rollback, revealing whether fast-batched plus cheap state commit can be viable when persist fits","rollback_reason":"32k/budget8 fast rollback fit and remained bit-equal, but acceptance fell to 1.83 over 18 steps and TPS stayed below the current exact-validation best","result":"persist allocated successfully: 621.00 MiB, fast_rollback=18, snapshot_replays=0, exact=0, target_tree=70.61 ms, rollback=0.65 ms, step=222.06 ms, correctness PASS","next_action_hint":"fast rollback itself is cheap when it fits; the remaining costs are target_tree and reduced acceptance at budget8/32k. Test 32k with larger budget if memory allows, or focus on freeing 64k memory for budget32 persist rather than optimizing rollback kernels."}}
{"run":96,"commit":"c499e6d","metric":0,"metrics":{"tps":0,"e2e_tps":0,"spec_sec":0,"gen_tokens":0,"steps":0,"committed":0,"step_ms":0,"pack_ms":0,"draft_ms":0,"topk_ms":0,"exact_ms":0,"exact_decode_ms":0,"acceptance":0},"status":"crash","description":"fast rollback fit probe at 32k context with budget 14","timestamp":1777506020282,"segment":1,"confidence":11.195080202773184,"asi":{"hypothesis":"32k context may free enough GPU memory for budget14 fast rollback persist, improving acceptance versus the budget8 fast-batched rollback run while keeping cheap state commit","rollback_reason":"budget14 persist allocated at 32k, but subsequent CUDA graph compute hit out-of-memory before benchmark metrics were emitted, so no throughput result","result":"persist allocation succeeded for 64 layers, 14 tokens, 1086.75 MiB, then CUDA OOM occurred in ggml_backend_cuda_graph_compute during target tree decode","next_action_hint":"32k frees enough memory for budget14 persist allocation but not enough working-memory headroom for compute; test an intermediate smaller budget such as 10 or 12, or reduce graph compute buffer pressure before revisiting budget14"}}
{"run":97,"commit":"c499e6d","metric":6.498729,"metrics":{"tps":6.498729,"e2e_tps":1.223055,"spec_sec":26.164,"gen_tokens":32,"steps":19,"committed":33,"step_ms":259.16,"pack_ms":1.06,"draft_ms":156.53,"topk_ms":16.75,"exact_ms":0,"exact_decode_ms":0,"acceptance":1.737},"status":"discard","description":"fast rollback fit probe at 32k context with budget 12","timestamp":1777506119588,"segment":1,"confidence":11.176576899258043,"asi":{"hypothesis":"32k context with budget12 may be the largest fast-rollback configuration that leaves enough compute headroom, improving acceptance versus budget8 without the budget14 CUDA OOM","rollback_reason":"budget12 fast rollback fit and stayed bit-equal, but acceptance fell to 1.74 over 19 steps and step time rose to 259 ms; primary TPS was far below the budget32 exact-validation best and below the 32k budget8 rollback probe","result":"persist allocated successfully at 931.50 MiB, fast_rollback=19, snapshot_replays=0, exact=0, target_tree=81.96 ms, rollback=0.66 ms, correctness PASS","next_action_hint":"budget12 worsens both acceptance and step time versus budget8 on this prompt; do not pursue 32k rollback budget sweep as an optimization. The useful finding is that rollback is cheap when persist fits; focus on freeing 64k memory or reducing target_tree cost."}}
{"run":98,"commit":"c499e6d","metric":7.723089,"metrics":{"tps":7.723089,"e2e_tps":1.206091,"spec_sec":26.532,"gen_tokens":32,"steps":18,"committed":33,"step_ms":230.19,"pack_ms":1.11,"draft_ms":137.84,"topk_ms":15.24,"exact_ms":0,"exact_decode_ms":0,"acceptance":1.833},"status":"discard","description":"64k fast rollback fit probe with smaller batch buffers and budget 8","timestamp":1777506225210,"segment":1,"confidence":11.158134659584107,"asi":{"hypothesis":"reducing n_batch/n_ubatch from 512 to 256 may free enough 64k GPU working memory for budget8 fast rollback persist while preserving the 64k prompt/proposal behavior better than lowering n_ctx to 32k","rollback_reason":"fast rollback fit and correctness passed, but budget8 acceptance stayed low at 1.83 over 18 steps and TPS remained below the budget32 exact-validation best","result":"64k with n_batch=256,n_ubatch=256 allocated 621.00 MiB persist, fast_rollback=18, snapshot_replays=0, exact=0, target_tree=73.86 ms, rollback=0.62 ms, correctness PASS","next_action_hint":"smaller batch buffers solve the 64k budget8 persist fit, but budget8 acceptance is too low; test whether n_batch=256 allows budget14 or higher persist, otherwise freeing memory alone at tiny budget is not enough"}}
{"run":99,"commit":"c499e6d","metric":7.011148,"metrics":{"tps":7.011148,"e2e_tps":1.201021,"spec_sec":26.644,"gen_tokens":32,"steps":17,"committed":33,"step_ms":268.48,"pack_ms":1.09,"draft_ms":160.71,"topk_ms":16.82,"exact_ms":0,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"64k fast rollback fit probe with smaller batch buffers and budget 14","timestamp":1777506326499,"segment":1,"confidence":10.999616549456363,"asi":{"hypothesis":"with n_batch/n_ubatch reduced to 256, 64k may have enough memory for budget14 fast rollback persist, improving acceptance over budget8 while keeping cheap rollback state commits","rollback_reason":"budget14 fast rollback fit and stayed bit-equal, but step time rose to 268 ms and acceptance only improved to 1.94, so primary TPS remained below both budget8 rollback and the budget32 exact-validation best","result":"64k n_batch=256,n_ubatch=256 allocated 1086.75 MiB persist, fast_rollback=17, snapshot_replays=0, exact=0, target_tree=86.66 ms, rollback=0.64 ms, correctness PASS","next_action_hint":"n_batch reduction frees enough memory for budget14, but target_tree+draft cost dominate and acceptance remains below the budget32 exact path; test whether budget22/32 fit with n_batch=256 only as a memory map, otherwise stop fast-rollback budget sweep and investigate target_tree cost or GPU target_feat ring."}}
{"run":100,"commit":"c499e6d","metric":0,"metrics":{"tps":0,"e2e_tps":0,"spec_sec":22.457,"gen_tokens":0,"steps":1,"committed":0,"step_ms":0,"pack_ms":3.57,"draft_ms":157.23,"topk_ms":20.94,"exact_ms":0,"exact_decode_ms":0,"acceptance":0},"status":"crash","description":"64k fast rollback fit probe with smaller batch buffers and budget 22","timestamp":1777506423953,"segment":1,"confidence":10.999616549456363,"asi":{"hypothesis":"with n_batch/n_ubatch reduced to 256, 64k may have enough memory for budget22 fast rollback persist, restoring more acceptance than budget8/14 while avoiding exact validation","rollback_reason":"budget22 persist still OOMed before generating tokens, so no throughput result; snapshot fallback was disabled intentionally to test true fast rollback fit","result":"CUDA allocation failed for 1707.75 MiB persist at n_tokens=22 even with n_batch=256,n_ubatch=256; budget14 is the largest tested 64k rollback configuration that fits, but it is too slow","next_action_hint":"do not pursue higher budget fast rollback under current 64k/full-draft memory envelope. Fast rollback needs either a smaller persist representation or major memory freeing; shift to target_tree cost reduction or GPU-resident target_feat ring diagnostics."}}
{"run":101,"commit":"c499e6d","metric":0,"metrics":{"tps":0,"e2e_tps":0,"spec_sec":26.823,"gen_tokens":0,"steps":1,"committed":0,"step_ms":0,"pack_ms":1.57,"draft_ms":153.13,"topk_ms":21.01,"exact_ms":0,"exact_decode_ms":0,"acceptance":0},"status":"crash","description":"64k fast rollback fit probe with n_batch 128 and budget 22","timestamp":1777506525928,"segment":1,"confidence":10.999616549456363,"asi":{"hypothesis":"reducing n_batch/n_ubatch further from 256 to 128 may free enough 64k GPU memory for budget22 fast rollback persist, restoring acceptance while avoiding exact validation","rollback_reason":"budget22 persist still OOMed before generating tokens even with n_batch=128; snapshot fallback was disabled intentionally to test true fast rollback fit","result":"CUDA allocation failed for 1707.75 MiB persist at n_tokens=22; reducing n_batch below 256 did not free enough additional memory, while prompt_ingests doubled to 160","next_action_hint":"stop trying to fit budget22/32 fast rollback via n_batch reduction. Need different persist representation, moving buffers off GPU, or target_tree/GPU target_feat root-cause work."}}
{"run":102,"commit":"c499e6d","metric":3.404755,"metrics":{"tps":3.404755,"e2e_tps":1.040312,"spec_sec":30.76,"gen_tokens":32,"steps":14,"committed":33,"step_ms":671.33,"pack_ms":1.15,"draft_ms":144.48,"topk_ms":20.62,"exact_ms":59.5,"exact_decode_ms":0,"acceptance":2.357},"status":"discard","description":"aggregate batched-vs-exact margin diagnostics at proposal temp 0.7","timestamp":1777506711000,"segment":1,"confidence":10.845539326886065,"asi":{"hypothesis":"add compact aggregate margin metrics for all diagnostic batched tree steps to evaluate whether a high-confidence exact-skip rule could be safe without relying on truncated per-node trace logs","rollback_reason":"diagnostic target-tree verification and logging are expectedly much slower than the runtime exact-validation path, so primary TPS is below best and the code is not a production optimization","result":"on the current prompt with budget32/temp0.7, batched_exact_same=14 and diff=0; aggregate margin steps=14, batched_min_avg=9.087, batched_min_min=0.895, ge1 same=13/13, ge2 same=12/12, ge5 same=9/9","next_action_hint":"current prompt supports a confidence-gated exact skip with thresholds >=1 or >=2, but this is not enough to ship; validate aggregate margins on additional prompts before trusting batched posterior, and pair any exact skip with cheap state commit because snapshot+replay is slower."}}
{"run":103,"commit":"c499e6d","metric":6.832497,"metrics":{"tps":6.832497,"e2e_tps":1.205636,"spec_sec":26.542,"gen_tokens":32,"steps":17,"committed":33,"step_ms":275.5,"pack_ms":1.1,"draft_ms":157.54,"topk_ms":20.23,"exact_ms":0,"exact_decode_ms":0,"acceptance":1.941},"status":"discard","description":"64k fast rollback fit probe with smaller batch buffers and budget 16","timestamp":1777506853395,"segment":1,"confidence":10.715120218615038,"asi":{"hypothesis":"budget16 may be a 64k fast-rollback middle point that still fits with n_batch/n_ubatch=256 while improving acceptance over budget8/14 enough to offset target_tree cost","rollback_reason":"budget16 fast rollback fit and stayed bit-equal, but acceptance remained 1.94 like budget14 while step time worsened to 275.5 ms, so primary TPS stayed far below the budget32 exact-validation best","result":"64k n_batch=256,n_ubatch=256 allocated 1242.00 MiB persist, fast_rollback=17, snapshot_replays=0, exact=0, target_tree=93.03 ms, rollback=0.68 ms, correctness PASS","next_action_hint":"fast rollback parameter sweep is exhausted: budget8/14/16 fit but are slow, budget22 OOMs. Move to target_tree cost reduction, broader batched/exact confidence validation, or GPU-resident target_feat ring instead of more budget tuning."}}
{"run":104,"commit":"c499e6d","metric":3.414521,"metrics":{"tps":3.414521,"e2e_tps":1.03795,"spec_sec":30.83,"gen_tokens":32,"steps":14,"committed":33,"step_ms":669.41,"pack_ms":1.16,"draft_ms":146.87,"topk_ms":20.93,"exact_ms":60.27,"exact_decode_ms":0,"acceptance":2.357},"status":"discard","description":"target-only chain decode timing diagnostic attempt","timestamp":1777507043307,"segment":1,"confidence":10.587800455864077,"asi":{"hypothesis":"instrument the chain reference path to quantify target-only one-token decode time and compare it with DDTree exact validation, clarifying whether the target-only reference mismatch comes from target decode overhead","rollback_reason":"diagnostic output was not surfaced because autoresearch.sh only tails the last 120 lines, and a preserved TRACE env pass-through accidentally enabled expensive target-tree tracing for every run; primary TPS is therefore not comparable","result":"the run still passed correctness but showed trace overhead: target_tree=118.29 ms and snapshot=150.34 ms in the exact path; chain timing detail needs a rerun after fixing TRACE pass-through/output capture","next_action_hint":"fix autoresearch.sh so empty LLAMA_DDTREE_TRACE does not enable tracing, then rerun the chain timing diagnostic with enough output to capture chain timing detail"}}
{"run":105,"commit":"9fbce93","metric":11.924636,"metrics":{"tps":11.924636,"e2e_tps":1.328573,"spec_sec":24.086,"gen_tokens":32,"steps":14,"committed":33,"step_ms":191.68,"pack_ms":1.1,"draft_ms":114.78,"topk_ms":17.91,"exact_ms":57.86,"exact_decode_ms":0,"acceptance":2.357},"status":"keep","description":"fix empty trace pass-through and add chain timing diagnostic hook","timestamp":1777507150730,"segment":1,"confidence":13.950437243906348,"asi":{"hypothesis":"the preserved autoresearch trace env pass-through was accidentally enabling target-tree diagnostics even when LLAMA_DDTREE_TRACE was empty; fixing it should restore normal exact-validation timing and allow a target-only chain timing diagnostic hook","result":"normal exact-validation path restored: target_tree=0, snapshot=0, step=191.68 ms, draft=114.78 ms, exact=57.86 ms, acceptance=2.36, tps=11.92; this is a measurement/harness fix rather than a product-code speedup","next_action_hint":"rerun once to confirm the 11.9 TPS result is not noise, and increase autoresearch output capture further if chain timing detail is still needed"}}
{"run":106,"commit":"9fbce93","metric":10.063906,"metrics":{"tps":10.063906,"e2e_tps":1.307083,"spec_sec":24.482,"gen_tokens":32,"steps":14,"committed":33,"step_ms":227.12,"pack_ms":1.11,"draft_ms":146.94,"topk_ms":20.58,"exact_ms":58.45,"exact_decode_ms":0,"acceptance":2.357},"status":"discard","description":"repeat normal exact-validation path after trace pass-through fix","timestamp":1777507233994,"segment":1,"confidence":13.654565431988555,"asi":{"hypothesis":"repeat the restored no-trace exact-validation path to determine whether the 11.92 TPS run was a real speedup or timing noise","rollback_reason":"repeat preserved the 14-step/2.36 acceptance behavior and no target_tree overhead, but step time returned to 227 ms and primary TPS fell below the kept 11.92 run","result":"target_tree=0 and snapshot=0 confirm the trace pass-through fix works; the 11.92 TPS run was mostly draft/step-time noise, while normal performance remains around 10.0 TPS","next_action_hint":"continue comparing future changes against a noisy 10.0-11.9 TPS range; if chain timing detail is still needed, make autoresearch.sh print the chain timing lines explicitly instead of relying on tail length."}}
{"run":107,"commit":"9fbce93","metric":9.951734,"metrics":{"tps":9.951734,"e2e_tps":1.299756,"spec_sec":24.62,"gen_tokens":32,"steps":14,"committed":33,"step_ms":229.68,"pack_ms":1.12,"draft_ms":149.57,"topk_ms":20.36,"exact_ms":58.6,"exact_decode_ms":0,"acceptance":2.357},"status":"discard","description":"surface target-only chain decode timing diagnostic","timestamp":1777507363548,"segment":1,"confidence":13.340332316400888,"asi":{"hypothesis":"print the chain reference timing lines from the full benchmark log so we can compare target-only one-token decode against DDTree exact validation and identify whether target decode itself is anomalously slow","rollback_reason":"this was a diagnostic harness output change and primary TPS stayed below the kept best; no production code speedup","result":"target-only chain after prompt has decode_avg=24.79 ms over 32 tokens, while DDTree exact validation is 58.60 ms per speculative step plus draft/topk; the target-only server reference gap is therefore not explained by base one-token target decode alone","next_action_hint":"investigate why DDTree exact validation costs ~58 ms/step versus target-only 1-token decode ~25 ms, likely hidden capture, seq_rm/cache state, or validation loop overhead; profile exact validation internals before more proposal tuning"}}
{"run":108,"commit":"9fbce93","metric":10.245246,"metrics":{"tps":10.245246,"e2e_tps":1.309436,"spec_sec":24.438,"gen_tokens":32,"steps":14,"committed":33,"step_ms":223.1,"pack_ms":1.11,"draft_ms":143.19,"topk_ms":20.11,"exact_ms":58.66,"exact_decode_ms":58.19,"acceptance":2.357},"status":"discard","description":"profile exact validation decode/sample timing","timestamp":1777507489027,"segment":1,"confidence":12.942229525616327,"asi":{"hypothesis":"instrument exact validation internals to determine whether the ~58 ms exact cost comes from llama_decode, logits sampling, advance callbacks, or driver overhead","rollback_reason":"diagnostic timing output only; primary TPS stayed below the kept best and there is no product-code optimization in this change","result":"exact validation is almost entirely target llama_decode: exact=58.66 ms/step, exact_decode=58.19 ms/step, exact_sample=0.41 ms/step, exact_nodes=33 over 14 steps; target-only chain decode remains 24.69 ms/token","next_action_hint":"focus on why exact validation llama_decode is ~2.35x slower per token than target-only chain decode: hidden capture enabled during spec, seq_rm/KV state mutations, or context state after draft/tree operations. Run a controlled chain-with-hidden-capture timing diagnostic next."}}
{"run":109,"commit":"9fbce93","metric":10.373578,"metrics":{"tps":10.373578,"e2e_tps":1.315735,"spec_sec":24.321,"gen_tokens":32,"steps":14,"committed":33,"step_ms":220.34,"pack_ms":1.12,"draft_ms":139.62,"topk_ms":20.94,"exact_ms":58.62,"exact_decode_ms":0,"acceptance":2.357},"status":"discard","description":"chain decode timing with hidden capture enabled","timestamp":1777507637568,"segment":1,"confidence":12.882552326173203,"asi":{"hypothesis":"if hidden capture is the reason DDTree exact validation llama_decode is ~2.35x slower than target-only chain decode, enabling capture_hidden on the chain reference should raise chain decode_avg toward ~58 ms/token","rollback_reason":"diagnostic-only harness/env pass-through change; primary TPS did not improve over the kept best","result":"with LLAMA_DDTREE_CHAIN_CAPTURE=1, chain decode_avg remained 24.74 ms/token while DDTree exact stayed 58.62 ms/step; hidden capture alone does not explain the exact-validation decode slowdown","next_action_hint":"next isolate seq_rm/KV state mutation or repeated position rollback effects: run target-only chain with seq_rm before each decode, or instrument exact validation to measure llama_memory_seq_rm and per-node decode depth costs."}}
{"run":110,"commit":"9fbce93","metric":10.039594,"metrics":{"tps":10.039594,"e2e_tps":1.291312,"spec_sec":24.781,"gen_tokens":32,"steps":14,"committed":33,"step_ms":227.67,"pack_ms":1.11,"draft_ms":147.57,"topk_ms":20.34,"exact_ms":58.61,"exact_decode_ms":0,"acceptance":2.357},"status":"discard","description":"chain decode timing with hidden capture and seq_rm controls","timestamp":1777507811343,"segment":1,"confidence":12.847363213337257,"asi":{"hypothesis":"if DDTree exact validation is slower because of hidden capture or the seq_rm call before validation, then target-only chain decode with capture_hidden plus seq_rm before every decode should approach the ~58 ms exact-validation cost","rollback_reason":"diagnostic-only harness changes; primary TPS stayed below the kept best and no production optimization was introduced","result":"chain with capture+seq_rm still decoded at 24.81 ms/token; seq_rm itself averaged only 0.026 ms. DDTree exact remained 58.61 ms/step, so neither hidden capture nor a no-op seq_rm explains the slowdown","next_action_hint":"isolate context-state effects specific to DDTree exact validation: repeated decode of root/current positions after previous speculative commits, recurrent/SSM state mutation, or graph shape/reuse differences. Next useful diagnostic is per-depth exact decode timing or chain replay pattern that decodes root+accepted path chunks like validation."}}
{"run":111,"commit":"9fbce93","metric":10.079439,"metrics":{"tps":10.079439,"e2e_tps":1.297911,"spec_sec":24.655,"gen_tokens":32,"steps":14,"committed":33,"step_ms":226.77,"pack_ms":1.11,"draft_ms":146.48,"topk_ms":20.54,"exact_ms":58.6,"exact_decode_ms":58.14,"acceptance":2.357},"status":"discard","description":"normalize exact validation decode cost per node","timestamp":1777507941061,"segment":1,"confidence":12.752562835954159,"asi":{"hypothesis":"the apparent ~58 ms exact-validation cost may be per speculative step, not per target token; logging exact_nodes and exact_decode per node should clarify whether exact llama_decode is actually slower than target-only chain decode","rollback_reason":"diagnostic-only harness logging; primary TPS stayed below the kept best and no runtime optimization was introduced","result":"exact_nodes=33 over 14 steps; exact_decode=58.14 ms/step but exact_node=24.66 ms/token, matching target-only chain decode_avg=24.76 ms/token. Exact validation is not slower per token; it costs about accepted_nodes_per_step target decodes.","next_action_hint":"stop treating exact validation as a per-token slowdown. The remaining throughput gap is structural: each DDTree step pays draft/topk plus ~2.36 target decodes. Next focus on either trusting batched posterior with cheap state commit, reducing target decodes per step, or lowering draft/topk cost."}}
{"run":112,"commit":"9fbce93","metric":10.473398,"metrics":{"tps":10.473398,"e2e_tps":1.308312,"spec_sec":24.459,"gen_tokens":32,"steps":14,"committed":33,"step_ms":218.24,"pack_ms":0.95,"draft_ms":139.62,"topk_ms":19.74,"exact_ms":57.9,"exact_decode_ms":0,"acceptance":2.357},"status":"discard","description":"test smaller target feature window 112 with proposal temp 0.7","timestamp":1777508043255,"segment":1,"confidence":12.651180373150947,"asi":{"hypothesis":"with proposal_temp=0.7 preserving the 14-step acceptance pattern, reducing target_feat_ctx from 128 to 112 may lower draft/pack cost without losing proposal quality","rollback_reason":"target_feat_ctx=112 preserved 14 steps and 2.36 acceptance and reduced pack/draft versus typical 128 runs, but primary TPS did not beat the kept best and remains within the noisy 10.0-11.9 range","result":"112-window produced tps=10.47, step=218.24 ms, draft=139.62 ms, pack=0.95 ms, acceptance=2.36; proposal_temp=0.7 appears to recover the acceptance loss previously seen at 112 without a clear primary improvement","next_action_hint":"test 96 or 104 only if looking for more draft-cost reduction, but compare against noise; avoid declaring smaller windows best until repeated and validated on broader prompts"}}
{"run":113,"commit":"9fbce93","metric":9.938876,"metrics":{"tps":9.938876,"e2e_tps":1.298174,"spec_sec":24.65,"gen_tokens":32,"steps":16,"committed":33,"step_ms":201.23,"pack_ms":0.88,"draft_ms":131.18,"topk_ms":18.74,"exact_ms":50.38,"exact_decode_ms":0,"acceptance":2.062},"status":"discard","description":"test smaller target feature window 104 with proposal temp 0.7","timestamp":1777508138445,"segment":1,"confidence":13.074597480445423,"asi":{"hypothesis":"with proposal_temp=0.7, target_feat_ctx=104 may further reduce draft/pack cost while still preserving enough proposal quality to beat the 128-window exact-validation path","rollback_reason":"104 reduced draft/pack and per-step time, but lost the 14-step acceptance pattern: steps rose to 16 and acceptance fell to 2.06, so primary TPS stayed below the current best","result":"target_feat_ctx=104 produced step=201.23 ms, draft=131.18 ms, pack=0.88 ms, but required 16 steps; smaller windows below 112 start trading away the structural acceptance gain","next_action_hint":"avoid shrinking target_feat_ctx below 112 for this prompt/temp unless validating a latency-focused variant; next try broader-prompt batched/exact confidence validation or target_tree/state-commit work rather than more small window tuning"}}
{"type":"config","name":"Optimize DFlash DDTree decode TPS on Castle - Phase 2: Batched posterior + fast commit","metricName":"tps","metricUnit":"","bestDirection":"higher"}
{"run":114,"commit":"166b674","metric":9.411626,"metrics":{},"status":"keep","description":"Phase 2 baseline: proposal_temp=0.7, budget=32, target_feat_ctx=128, exact-validation path on real_rendered_prompt","timestamp":1777537489249,"segment":2,"confidence":null,"asi":{"hypothesis":"establish new segment baseline with current best config before pursuing batched posterior validation and fast commit optimizations","benchmark":"./autoresearch.sh on Castle; gen=32; prompt=/tmp/real_rendered_prompt.txt; LLAMA_DDTREE_PROPOSAL_TEMP=0.7; LLAMA_DDTREE_TARGET_FEAT_CTX=128","note":"steps=15 vs previous segment best 14-step runs; within noise band 9.4-10.5 TPS"}}
{"run":115,"commit":"9fbce93","metric":7.28014,"metrics":{},"status":"discard","description":"fast-batched + fast-rollback with budget 10 at 64k context","timestamp":1777541038238,"segment":2,"confidence":null,"asi":{"hypothesis":"test whether fast-batched path with small budget 10 can achieve acceptable TPS without exact validation","note":"fast rollback fits at budget10 with n_batch=256; acceptance=1.94, step=258ms, TPS=7.28 vs baseline 9.41","next_action_hint":"budget10 fast-batched is worse than exact path; try smaller budgets or combine with other optimizations"}}
{"run":116,"commit":"9fbce93","metric":7.136072,"metrics":{},"status":"discard","description":"fast-batched + fast-rollback with budget 14 at 64k context","timestamp":1777541046222,"segment":2,"confidence":null,"asi":{"hypothesis":"test whether budget 14 fast-batched improves over budget 10 with better acceptance","note":"budget 14 step time increased to 264ms but acceptance stayed 1.94; TPS=7.14 worse than budget 10","next_action_hint":"larger budgets increase target_tree cost without acceptance gain in fast-batched mode; smaller budgets may be better"}}
{"run":117,"commit":"9fbce93","metric":7.851335,"metrics":{},"status":"discard","description":"fast-batched + fast-rollback with budget 8 at 64k context","timestamp":1777541054565,"segment":2,"confidence":null,"asi":{"hypothesis":"test whether budget 8 fast-batched achieves best TPS among small-budget fast-rollback fits","next_action_hint":"budget 8 is the best fast-batched config so far (7.85 TPS) but still below exact path; target_tree decode dominates fast-batched cost","note":"budget 8 step=226ms, acceptance=1.83, draft=135ms; fast-batched path consistently slower than exact validation for this implementation"}}
{"run":118,"commit":"ade211a","metric":10.195434,"metrics":{"e2e_tps":1.306336,"spec_sec":24.496,"gen_tokens":32,"steps":14,"committed":33,"step_ms":224.19,"pack_ms":1.16,"draft_ms":143.91,"topk_ms":20.75,"exact_ms":58.32,"acceptance":2.357},"status":"keep","description":"test DDTree top_k=4 with proposal temp 0.7 exact-validation path","timestamp":1777543025604,"segment":2,"confidence":1.0958318828179303,"asi":{"benchmark":"./autoresearch.sh; prompt=/tmp/real_rendered_prompt.txt; LLAMA_DDTREE_PROPOSAL_TEMP=0.7; AUTORESEARCH_BUDGET=32; LLAMA_DDTREE_TOP_K=4","hypothesis":"reducing DDTree top-k cardinality from default 8 to 4 may lower CPU top-k scan cost without losing acceptance on this prompt","next_action_hint":"repeat K=4 and test K=2 to see if smaller K continues to help or hurts acceptance; if stable, consider defaulting to K=4","note":"first K=4 run gave 10.20 TPS, repeat gave 10.03; both above 9.41 baseline but within historical 9.4-10.5 noise band; draft/step time slightly lower than baseline"}}
{"run":119,"commit":"e385d50","metric":10.147005,"metrics":{"acceptance":2.357,"committed":33,"draft_ms":145.55,"e2e_tps":1.302932,"exact_ms":58.27,"gen_tokens":32,"pack_ms":0.94,"spec_sec":24.56,"step_ms":225.26,"steps":14,"topk_ms":20.47},"status":"keep","description":"test target_feat_ctx=112 with top_k=4 and proposal_temp=0.7 exact path","timestamp":1777545728081,"segment":2,"confidence":0.550668850678441,"asi":{"hypothesis":"smaller target feature window 112 may reduce draft compute without losing acceptance versus 128","next_action_hint":"test ctx=96 or 104 to find the lower bound; if none beat 128, keep 128 as default","note":"ctx=112 produced 10.15 TPS, step=225ms, draft=145ms, acceptance=2.36; within noise of ctx=128 best"}}
{"run":120,"commit":"ff511ad","metric":9.892726,"metrics":{"acceptance":2.429,"committed":34,"draft_ms":147.88,"e2e_tps":0.951446,"exact_ms":61.21,"gen_tokens":32,"pack_ms":1.11,"spec_sec":33.633,"step_ms":231.05,"steps":14,"topk_ms":20.82},"status":"keep","description":"exact path with n_batch=64 to validate per-layer persist allocation does not regress baseline","timestamp":1777546971003,"segment":2,"confidence":1,"asi":{"hypothesis":"validate that per-layer persist allocation + n_batch=64 does not break exact path correctness or TPS","next_action_hint":"per-layer allocation is committed and safe; continue exploring fast-batched optimization or server integration","note":"n_batch=64 exact path achieved 9.89 TPS, slightly below n_batch=512 best (10.15) but within noise; per-layer allocation code is working"}}
{"run":121,"commit":"ff511ad","metric":10.188617,"metrics":{"acceptance":2.357,"committed":33,"draft_ms":143.93,"e2e_tps":1.307029,"exact_ms":58.34,"gen_tokens":32,"pack_ms":1.11,"spec_sec":24.483,"step_ms":224.34,"steps":14,"topk_ms":20.92},"status":"discard","description":"exact path budget 40 with top_k=4 and proposal_temp=0.7 to test acceptance saturation beyond budget 32","timestamp":1777547541311,"segment":2,"confidence":1.451900946467489,"asi":{"hypothesis":"budget 40 provides more tree nodes than budget 32, potentially increasing acceptance per step enough to reduce total steps and improve overall TPS","rollback_reason":"budget 40 produced identical acceptance (2.357), steps (14), and committed (33) as budget 32, with nearly identical step time (~224ms); extra budget nodes do not improve proposal quality on this prompt","next_action_hint":"acceptance is saturated at ~2.36 for this prompt with budget >=32; try budget 28 or 30 to see if lower budget achieves same acceptance with less overhead, or switch to cross-prompt validation or server integration testing"}}
{"run":122,"commit":"974eb18","metric":11.808554,"metrics":{"acceptance":2.267,"committed":34,"draft_ms":106.3,"e2e_tps":1.32714,"exact_ms":55.5,"gen_tokens":32,"pack_ms":1.06,"spec_sec":24.112,"step_ms":180.66,"steps":15,"topk_ms":17.77},"status":"keep","description":"exact path budget 28 with top_k=4 and proposal_temp=0.7 — large draft speedup vs budget 32","timestamp":1777547640327,"segment":2,"confidence":4.982182498441078,"asi":{"hypothesis":"budget 28 may reduce draft tree-compute overhead more than it hurts acceptance, yielding better TPS than budget 32 despite slightly lower per-step acceptance","next_action_hint":"repeat budget 28 immediately to validate whether the 106ms draft time and 11.81 TPS are stable or noise; if repeatable, this is a major finding","note":"budget 28 step time dropped to 180.66ms (from ~224ms at budget 32), driven by draft_ms falling to 106.3ms (from ~144ms); acceptance slightly lower at 2.267 vs 2.357 but more than offset by per-step speedup"}}
{"run":123,"commit":"974eb18","metric":10.144721,"metrics":{"acceptance":2.267,"committed":34,"draft_ms":133.06,"e2e_tps":1.307831,"exact_ms":56.01,"gen_tokens":32,"pack_ms":1.07,"spec_sec":24.468,"step_ms":210.29,"steps":15,"topk_ms":20.12},"status":"discard","description":"repeat budget 28 to validate the 11.81 TPS result","timestamp":1777547746753,"segment":2,"confidence":6.1161100677716975,"asi":{"hypothesis":"repeat budget 28 to check whether the 11.81 TPS run was stable or noise","next_action_hint":"budget 28 repeat at 10.14 TPS is comparable to budget 32 (10.20 TPS); try budget 30 as the intermediate sweet spot, or investigate why draft time varies so widely (106-144ms)","note":"repeat budget 28 draft_ms=133.06ms, step_ms=210.29ms, TPS=10.14 — much closer to budget 32 than the first run's 106ms/180ms/11.81; first run was likely an outlier","rollback_reason":"10.14 TPS is slightly below the kept budget 32 best of 10.20 TPS; budget 28 is not clearly better"}}
{"run":124,"commit":"ff511ad","metric":10.068782,"metrics":{"acceptance":2.286,"committed":32,"draft_ms":148.42,"e2e_tps":1.304472,"exact_ms":56.42,"gen_tokens":32,"pack_ms":1.09,"spec_sec":24.531,"step_ms":227.01,"steps":14,"topk_ms":21.04},"status":"discard","description":"exact path budget 30 with top_k=4 and proposal_temp=0.7 — intermediate sweep between 28 and 32","timestamp":1777547834290,"segment":2,"confidence":13.614577179988123,"asi":{"hypothesis":"budget 30 may be the sweet spot between budget 28 and 32, preserving acceptance while reducing overhead","rollback_reason":"budget 30 produced lower acceptance (2.286 vs 2.357), higher draft time (148ms vs 144ms), and lower TPS (10.07 vs 10.20) than budget 32; budget 32 remains the best exact-path configuration","next_action_hint":"budget 32 is the exact-path sweet spot; validate on a different prompt to check for overfitting, or try top_k=6, or explore reducing exact validation cost per step"}}
{"run":125,"commit":"ff511ad","metric":14.684288,"metrics":{"acceptance":2.4,"committed":12,"draft_ms":84.62,"e2e_tps":11.472275,"exact_ms":57.61,"gen_tokens":12,"pack_ms":0.37,"spec_sec":1.046,"step_ms":163.44,"steps":5,"topk_ms":20.8},"status":"discard","description":"cross-prompt validation on qwen_rendered_prompt.txt with best config — only 12 tokens generated, not comparable to 32-token benchmark","timestamp":1777547880051,"segment":2,"confidence":5.272893061768601,"asi":{"hypothesis":"validate best config (budget 32, top_k=4, proposal_temp=0.7) on a different prompt to check for overfitting","next_action_hint":"this prompt only generated 12 tokens (EOS early), making it incomparable to the 32-token benchmark; try a different prompt that generates closer to 32 tokens, such as task761 or task0","note":"config works correctly on different prompt (bit-equal PASS) with acceptance=2.40, but short generation makes TPS incomparable"}}
{"run":126,"commit":"ff511ad","metric":7.524101,"metrics":{"acceptance":1.65,"committed":33,"draft_ms":149.57,"e2e_tps":1.181422,"exact_ms":41.2,"gen_tokens":32,"pack_ms":1.03,"spec_sec":27.086,"step_ms":212.65,"steps":20,"topk_ms":20.82},"status":"discard","description":"cross-prompt validation on task761 (21k tokens) with best config — acceptance drops to 1.65, TPS falls to 7.52","timestamp":1777547973644,"segment":2,"confidence":3.647426181911143,"asi":{"hypothesis":"validate best config on a different prompt to check for overfitting","next_action_hint":"acceptance varies significantly by prompt (1.65 vs 2.36); test whether higher budget or top_k=6 improves harder prompts without hurting the benchmark prompt","note":"task761 generated 32 tokens in 20 steps (vs 14 on real_rendered), acceptance 1.65; draft model effectiveness varies by prompt; config is correct (bit-equal PASS) but slower on this prompt","rollback_reason":"7.52 TPS is below baseline on this prompt, but this is a cross-validation run rather than a benchmark regression; the benchmark prompt remains the optimization target"}}
{"run":127,"commit":"ff511ad","metric":9.903871,"metrics":{"acceptance":2.357,"committed":33,"draft_ms":149.32,"e2e_tps":1.293766,"exact_ms":58.77,"gen_tokens":32,"pack_ms":1.11,"spec_sec":24.734,"step_ms":230.79,"steps":14,"topk_ms":21.56},"status":"discard","description":"exact path top_k=2 with budget 32 and proposal_temp=0.7 — acceptance same as top_k=4 but slightly slower","timestamp":1777548247599,"segment":2,"confidence":6.1161100677716975,"asi":{"hypothesis":"top_k=2 may reduce topk overhead while maintaining acceptance if the extra K=4 candidates are unused on this prompt","next_action_hint":"top_k=2 acceptance is identical to top_k=4 on this prompt (2.357), but draft/topk noise makes it slightly slower; try n_ctx=32k to see if reduced KV cache pressure helps target decode, or test top_k=4 on task761 to check cross-prompt generalization","rollback_reason":"9.90 TPS is below the kept top_k=4 best of 10.20 TPS; top_k=2 is not better on the benchmark prompt","note":"acceptance unchanged at 2.357, confirming that extra K=4 candidates do not improve acceptance on this prompt; draft time 149ms vs 144ms is within noise"}}
{"run":128,"commit":"ff511ad","metric":10.212288,"metrics":{"acceptance":2.357,"committed":33,"draft_ms":143.97,"e2e_tps":1.301236,"exact_ms":58.07,"gen_tokens":32,"pack_ms":1.09,"spec_sec":24.592,"step_ms":223.82,"steps":14,"topk_ms":20.66},"status":"discard","description":"exact path with n_ctx=32768 to test if reduced KV cache allocation improves TPS","timestamp":1777548345195,"segment":2,"confidence":13.614577179988123,"asi":{"hypothesis":"n_ctx=32768 reduces KV cache allocation overhead compared to 65536, potentially improving target decode speed","next_action_hint":"n_ctx has no meaningful effect on exact-path TPS; try draft GPU layers 5 or explore exact-validation batching/parallelization","note":"n_ctx=32k produced 10.21 TPS, essentially identical to 64k (10.20); KV cache pressure is not the bottleneck for this prompt length (20k tokens)","rollback_reason":"10.21 TPS is indistinguishable from 10.20 best; no improvement from halving n_ctx"}}
{"run":129,"commit":"ff511ad","metric":8.558163,"metrics":{"acceptance":2.429,"committed":34,"draft_ms":187.64,"e2e_tps":1.283028,"exact_ms":59.93,"gen_tokens":32,"pack_ms":1.1,"spec_sec":24.941,"step_ms":267.08,"steps":14,"topk_ms":18.36},"status":"discard","description":"exact path with draft GPU layers 5 — one layer off GPU hurts draft time significantly","timestamp":1777548448213,"segment":2,"confidence":5.987365455085925,"asi":{"hypothesis":"draft GPU layers 5 may reduce GPU memory contention or improve CPU-GPU balance","next_action_hint":"ngld6 is optimal for exact path; do not reduce draft offload further. Next explore exact-validation batching/parallelization or target decode call overhead reduction.","note":"draft_ms jumped to 187.64ms from ~144ms, step_ms to 267ms; moving draft layers off GPU hurts more than any acceptance gain","rollback_reason":"8.56 TPS is far below the 10.20 best; draft GPU offload must stay at 6 for exact path performance"}}
{"run":130,"commit":"974eb18","metric":9.884597,"metrics":{"acceptance":2.357,"committed":33,"draft_ms":150.8,"e2e_tps":1.298069,"exact_ms":58.39,"gen_tokens":32,"pack_ms":1.11,"spec_sec":24.652,"step_ms":231.24,"steps":14,"topk_ms":20.91},"status":"discard","description":"exact path with chain_seed=0 (pure best-first) and optimized params — chain seed still helps","timestamp":1777548855529,"segment":2,"confidence":7.771711676074962,"asi":{"hypothesis":"with top_k=4 and proposal_temp=0.7, pure best-first might produce a better tree than chain seed","next_action_hint":"chain seed remains beneficial; do not disable it. Next try gen=64 diagnostic or explore exact-validation batching code optimization.","note":"chain_seed=0 produced 9.88 TPS vs 10.20 with default chain seed; draft_ms=150.8 vs ~144ms; chain seed improves tree quality enough to offset any overhead","rollback_reason":"9.88 TPS below best; chain seed default is correct"}}
{"run":131,"commit":"974eb18","metric":9.720872,"metrics":{"acceptance":2.1,"committed":42,"draft_ms":143.17,"e2e_tps":1.631702,"exact_ms":51.79,"gen_tokens":42,"pack_ms":1.03,"spec_sec":25.74,"step_ms":216.03,"steps":20,"topk_ms":20},"status":"discard","description":"diagnostic run with gen=64 to check TPS over longer generation","timestamp":1777548960687,"segment":2,"confidence":7.844313356514611,"asi":{"hypothesis":"longer generation may reveal acceptance degradation or step time changes not visible in the 32-token benchmark","next_action_hint":"acceptance degrades from 2.36 to 2.10 over longer generation; draft/step time stays stable. Focus on per-step optimizations rather than prompt-length tuning.","note":"gen_tokens=42 (EOS at 42), steps=20, acceptance=2.10, step_ms=216ms; TPS=9.72 vs 10.2 for gen=32; draft model accuracy degrades with longer context, as expected"}}
{"run":132,"commit":"974eb18","metric":9.979978,"metrics":{"acceptance":2.357,"committed":33,"draft_ms":148.91,"e2e_tps":1.293191,"exact_ms":58.56,"gen_tokens":32,"pack_ms":1.1,"spec_sec":24.745,"step_ms":229.03,"steps":14,"topk_ms":20.43},"status":"discard","description":"exact path with K=1 to test whether topk_ms is CPU-bound or sync-bound","timestamp":1777549060523,"segment":2,"confidence":8.220960821503416,"asi":{"hypothesis":"K=1 argmax should be much faster than K=4 logsumexp+heap if CPU computation dominates topk_ms","next_action_hint":"topk_ms is dominated by GPU sync/wait, not CPU computation (K=1=20.4ms vs K=4=20.9ms). Do not optimize CPU top-k further. Focus on draft decode speed or exact-validation batching.","note":"K=1 topk_ms=20.43ms, K=4 topk_ms=20.92ms — difference is only 0.5ms. The ~20ms is mostly GPU synchronization tail from draft decode. Total draft+topk is ~169ms regardless of K.","rollback_reason":"K=1 does not improve topk_ms meaningfully; TPS=9.98 below best of 10.20"}}
{"run":133,"commit":"974eb18","metric":9.720387,"metrics":{"acceptance":2.267,"committed":34,"draft_ms":149.93,"e2e_tps":1.29728,"exact_ms":55.65,"gen_tokens":32,"pack_ms":1.08,"spec_sec":24.667,"step_ms":219.47,"steps":15,"topk_ms":12.79},"status":"discard","description":"exact path with block_size=8 override — draft time unchanged, acceptance drops","timestamp":1777549316541,"segment":2,"confidence":8.160393835091734,"asi":{"hypothesis":"reducing block_size from 16 to 8 should halve draft compute if draft time scales with tokens","next_action_hint":"draft time is NOT proportional to block_size; the SSM tree kernel likely has fixed overhead per step. Revert block_size override and try other optimizations.","note":"draft_ms=149.93 with block_size=8 vs ~144ms with block_size=16 — essentially the same. Acceptance dropped to 2.267 (from 2.357) because shallower tree has fewer candidate paths.","rollback_reason":"block_size=8 hurts acceptance without improving draft speed; not a useful optimization"}}
{"run":134,"commit":"974eb18","metric":10.074996,"metrics":{"acceptance":2.357,"committed":33,"draft_ms":148.2,"e2e_tps":1.297859,"exact_ms":58.21,"gen_tokens":32,"pack_ms":1.09,"spec_sec":24.656,"step_ms":226.87,"steps":14,"topk_ms":19.34},"status":"discard","description":"exact path with KV type q8_0 — similar performance to q4_0","timestamp":1777549692319,"segment":2,"confidence":8.417775842329654,"asi":{"hypothesis":"q8_0 KV cache may improve cache locality or reduce dequantization overhead vs q4_0","next_action_hint":"KV cache type has no meaningful effect on TPS; try other optimizations like target GPU layers or draft model quantization","note":"q8_0 produced 10.07 TPS, essentially identical to q4_0 (10.20); correctness passes; KV type is not a bottleneck","rollback_reason":"no improvement over q4_0 baseline"}}
{"run":135,"commit":"b4033f4","metric":10.364171,"metrics":{"acceptance":2.357,"committed":33,"draft_ms":142.29,"e2e_tps":1.309329,"exact_ms":57.78,"gen_tokens":32,"pack_ms":1.1,"spec_sec":24.44,"step_ms":220.54,"steps":14,"topk_ms":19.33},"status":"keep","description":"exact path with target GPU layers 80 — potential improvement over 65","timestamp":1777549798463,"segment":2,"confidence":9.583840128587989,"asi":{"hypothesis":"increasing target GPU layers from 65 to 80 may offload remaining CPU layers and improve target decode speed","next_action_hint":"repeat with n_gpu_layers=80 to confirm stability, and test n_gpu_layers=99 to check for further gains","note":"TPS=10.36, draft_ms=142.29, step_ms=220.54 — slightly better than n_gpu_layers=65 best of 10.20; could be noise or real improvement from better GPU utilization"}}
{"run":136,"commit":"b4033f4","metric":10.053725,"metrics":{"acceptance":2.357,"committed":33,"draft_ms":147.13,"e2e_tps":1.302189,"exact_ms":58.7,"gen_tokens":32,"pack_ms":1.09,"spec_sec":24.574,"step_ms":227.35,"steps":14,"topk_ms":20.39},"status":"discard","description":"exact path with target GPU layers 99 — worse than 80, possibly noise or memory contention","timestamp":1777549879409,"segment":2,"confidence":10.317799492058029,"asi":{"hypothesis":"n_gpu_layers=99 may fully offload all layers, potentially improving over 80","next_action_hint":"n_gpu_layers=80 appears best among tested values; repeat n_gpu_layers=80 for stability","rollback_reason":"10.05 TPS below n_gpu_layers=80 best of 10.36; 99 may cause memory fragmentation or is within noise","note":"step_ms=227.35, draft_ms=147.13 at n_gpu_layers=99 vs step_ms=220.54, draft_ms=142.29 at n_gpu_layers=80"}}
{"run":137,"commit":"b4033f4","metric":9.495718,"metrics":{"acceptance":2.357,"committed":33,"draft_ms":159.25,"e2e_tps":1.288297,"exact_ms":58.13,"gen_tokens":32,"pack_ms":1.11,"spec_sec":24.839,"step_ms":240.71,"steps":14,"topk_ms":22.19},"status":"discard","description":"repeat n_gpu_layers=80 to check stability — regressed to 9.50 TPS, confirming first run was noise","timestamp":1777549973455,"segment":2,"confidence":9.583840128587989,"asi":{"hypothesis":"repeat n_gpu_layers=80 to confirm the 10.36 TPS improvement was stable","next_action_hint":"n_gpu_layers parameter does not meaningfully affect TPS; the 10.36 run was noise. Return to exploring exact-validation batching or other code optimizations.","note":"repeat draft_ms=159.25, step_ms=240.71, TPS=9.50 — much worse than first run; GPU thermal/scheduling noise dominates small parameter changes","rollback_reason":"9.50 TPS well below best; n_gpu_layers=80 is not a real improvement"}}
{"run":138,"commit":"974eb18","metric":0,"metrics":{"acceptance":0,"committed":0,"draft_ms":0,"e2e_tps":0,"exact_ms":0,"gen_tokens":0,"pack_ms":0,"spec_sec":0,"step_ms":0,"steps":0,"topk_ms":0},"status":"crash","description":"exact path with no flash attention — fails because q4_0 KV cache requires flash_attn","timestamp":1777550079676,"segment":2,"confidence":9.583840128587989,"asi":{"hypothesis":"disabling flash attention might reduce overhead for small batch decodes","next_action_hint":"flash attention is required for this config; do not test without it","rollback_reason":"V cache quantization requires flash_attn; cannot disable flash attention with q4_0 KV cache"}}
{"run":139,"commit":"974eb18","metric":9.686461,"metrics":{"acceptance":2.429,"committed":34,"draft_ms":154.37,"e2e_tps":1.301607,"exact_ms":59.7,"gen_tokens":32,"pack_ms":1.14,"spec_sec":24.585,"step_ms":235.97,"steps":14,"topk_ms":20.73},"status":"discard","description":"exact path with draft_n_ctx capped to 512 — works correctly but slightly slower","timestamp":1777550273300,"segment":2,"confidence":8.417775842329654,"asi":{"hypothesis":"reducing draft_n_ctx from 4096 to 512 may reduce memory pressure and improve draft speed","next_action_hint":"draft_n_ctx can be as low as 512 without breaking correctness, but does not improve TPS. Revert and focus on other optimizations.","note":"draft_n_ctx=512 passed correctness but draft_ms=154.37 vs ~144ms baseline; no speedup","rollback_reason":"9.69 TPS below 10.20 best; draft_n_ctx reduction does not improve performance"}}
{"run":140,"commit":"974eb18","metric":9.452913,"metrics":{"acceptance":2.357,"committed":33,"draft_ms":160.47,"e2e_tps":1.286225,"exact_ms":58.18,"gen_tokens":32,"pack_ms":1.09,"spec_sec":24.879,"step_ms":241.8,"steps":14,"topk_ms":22.02},"status":"discard","description":"exact path with block_size=4 — draft time increases, not helpful","timestamp":1777550372252,"segment":2,"confidence":8.160393835091734,"asi":{"hypothesis":"block_size=4 should reduce draft compute if proportional to tokens","next_action_hint":"draft time has fixed floor around 140-160ms regardless of block_size; do not change block_size","note":"block_size=4 draft_ms=160.47 vs ~144ms at 16; acceptance unchanged at 2.357","rollback_reason":"worse TPS and higher draft time"}}
{"run":141,"commit":"974eb18","metric":9.433795,"metrics":{"acceptance":2.429,"committed":34,"draft_ms":158.73,"e2e_tps":1.286691,"exact_ms":60.06,"gen_tokens":32,"pack_ms":1.38,"spec_sec":24.87,"step_ms":242.29,"steps":14,"topk_ms":22.09},"status":"discard","description":"exact path with target_feat_ctx=160 — slightly higher acceptance but much higher draft time","timestamp":1777550473024,"segment":2,"confidence":7.918284287167812,"asi":{"hypothesis":"larger target feature window 160 may improve draft model accuracy","next_action_hint":"target_feat_ctx=128 remains the sweet spot; 160 increases draft time more than acceptance gain","note":"acceptance=2.429 vs 2.357 but draft_ms=158.73 vs ~144ms; net TPS worse","rollback_reason":"9.43 TPS below 10.20 best"}}
{"run":142,"commit":"b4033f4","metric":10.018032,"metrics":{"acceptance":2.357,"committed":33,"draft_ms":148.43,"e2e_tps":1.257664,"exact_ms":57.94,"gen_tokens":32,"pack_ms":1.1,"spec_sec":25.444,"step_ms":228.16,"steps":14,"topk_ms":20.66},"status":"discard","description":"exact path with actual top_k=4 — same as K=8 on this prompt","timestamp":1777550692217,"segment":2,"confidence":8.160393835091734,"asi":{"hypothesis":"actual top_k=4 may reduce topk overhead while maintaining acceptance","next_action_hint":"K=4 acceptance identical to K=8; test K=2 to find the lower bound","note":"actual K=4 produces same acceptance (2.357) and similar timing to K=8; extra candidates 5-8 are unused on this prompt"}}
{"run":143,"commit":"b4033f4","metric":9.697851,"metrics":{"acceptance":2.2,"committed":33,"draft_ms":145.8,"e2e_tps":1.295442,"exact_ms":53.86,"gen_tokens":32,"pack_ms":1.07,"spec_sec":24.702,"step_ms":219.98,"steps":15,"topk_ms":19.22},"status":"discard","description":"exact path with actual top_k=2 — acceptance drops to 2.20, requiring 15 steps","timestamp":1777550958043,"segment":2,"confidence":8.10071276246994,"asi":{"hypothesis":"actual top_k=2 may reduce overhead without losing too much acceptance","next_action_hint":"K=2 hurts acceptance (2.20 vs 2.36); K=4 matches K=8. Test K=4 again for confirmation.","note":"actual K=2: acceptance=2.20, steps=15, TPS=9.70; K=4 and K=8 both give 2.36/14 steps"}}
{"run":144,"commit":"74b01e6","metric":10.254438,"metrics":{"acceptance":2.357,"committed":33,"draft_ms":144.06,"e2e_tps":1.306389,"exact_ms":57.84,"gen_tokens":32,"pack_ms":1.11,"spec_sec":24.495,"step_ms":222.9,"steps":14,"topk_ms":19.86},"status":"keep","description":"exact path with actual top_k=4 — matches K=8 acceptance with less CPU work, new best TPS","timestamp":1777551092551,"segment":2,"confidence":8.160393835091734,"asi":{"hypothesis":"actual top_k=4 matches K=8 acceptance while using less CPU and memory for top-k extraction","next_action_hint":"K=4 is validated as equivalent to K=8 on this prompt; continue exploring other optimizations like exact-validation batching or server integration","note":"code changes for --ddtree-top-k flag and top_k parameter are now committed; K=4 acceptance=2.357, steps=14, TPS=10.25"}}
{"run":145,"commit":"74b01e6","metric":9.812038,"metrics":{"acceptance":2.357,"committed":33,"draft_ms":152.96,"e2e_tps":1.287208,"exact_ms":58.2,"gen_tokens":32,"pack_ms":1.1,"spec_sec":24.86,"step_ms":232.95,"steps":14,"topk_ms":20.65},"status":"discard","description":"repeat best config to check noise — draft time 153ms, TPS drops to 9.81","timestamp":1777551308662,"segment":2,"confidence":8.10071276246994,"asi":{"hypothesis":"repeat best config to establish noise floor","next_action_hint":"run one more repeat to confirm noise band, then try a different approach if no stable improvement is found","note":"draft_ms varies from 144ms to 153ms between runs; TPS range 9.8-10.3; noise floor is about 0.5 TPS","rollback_reason":"9.81 TPS below best of 10.25; within noise"}}
{"run":146,"commit":"6a23e9f","metric":0,"metrics":{"e2e_tps":0,"spec_sec":0,"gen_tokens":0,"steps":0,"committed":0,"step_ms":0,"pack_ms":0,"draft_ms":0,"topk_ms":0,"exact_ms":0,"acceptance":0},"status":"crash","description":"attempted repeat best config with inline env but harness rejected custom command form","timestamp":1777556393600,"segment":2,"confidence":8.10071276246994,"asi":{"hypothesis":"repeat pure best-first K=4 best config to confirm stability","rollback_reason":"run_experiment enforces direct ./autoresearch.sh invocation when autoresearch.sh exists and rejected inline environment assignment","next_action_hint":"temporarily edit autoresearch.sh defaults to the target config, then run exactly ./autoresearch.sh"}}
{"run":147,"commit":"6a23e9f","metric":0,"metrics":{"e2e_tps":0,"spec_sec":0,"gen_tokens":0,"steps":0,"committed":0,"step_ms":0,"pack_ms":0,"draft_ms":0,"topk_ms":0,"exact_ms":0,"acceptance":0},"status":"crash","description":"attempted direct script run with cd prefix but harness still rejected command form","timestamp":1777556433889,"segment":2,"confidence":8.10071276246994,"asi":{"hypothesis":"run ./autoresearch.sh after encoding best defaults in the script for repeat validation","rollback_reason":"run_experiment requires the command string to be exactly ./autoresearch.sh or bash autoresearch.sh; cd prefix is rejected","next_action_hint":"invoke run_experiment with command exactly ./autoresearch.sh and rely on harness working directory, or add a root-level wrapper only if exact command cannot find the script"}}
{"run":148,"commit":"de24439","metric":10.589855,"metrics":{"e2e_tps":1.303038,"spec_sec":24.558,"gen_tokens":32,"steps":14,"committed":33,"step_ms":215.84,"pack_ms":1.14,"draft_ms":137.23,"topk_ms":19.59,"exact_ms":57.84,"acceptance":2.357},"status":"keep","description":"repeat pure best-first K=4 defaults in autoresearch.sh — stable new best","timestamp":1777556713769,"segment":2,"confidence":8.160393835091734,"asi":{"hypothesis":"encoding the current best exact-path configuration as harness defaults and repeating pure best-first K=4 should confirm whether the 10.33 TPS run was stable","result":"repeat improved to 10.59 TPS with same 14-step acceptance pattern; draft_ms dropped to 137ms and exact_ms remained 57.8ms","next_action_hint":"use these defaults as the new baseline; next test whether budget can be reduced under pure best-first K=4, or validate on task761 to avoid overfitting"}}
{"run":149,"commit":"de24439","metric":9.866251,"metrics":{"e2e_tps":1.293086,"spec_sec":24.747,"gen_tokens":32,"steps":14,"committed":32,"step_ms":231.67,"pack_ms":1.13,"draft_ms":153.82,"topk_ms":20.37,"exact_ms":56.32,"acceptance":2.286},"status":"discard","description":"pure best-first K=4 with budget 28 — acceptance drops and draft time rises","timestamp":1777556801426,"segment":2,"confidence":8.10071276246994,"asi":{"hypothesis":"with pure best-first K=4, budget 28 might preserve the 14-step path while reducing draft/tree overhead versus budget 32","rollback_reason":"budget 28 lost one committed token, lowered acceptance to 2.286, and draft_ms increased to 153.8ms; primary TPS fell to 9.87 below the 10.59 best","next_action_hint":"do not reduce budget below 32 for this pure best-first config; test budget 36/40 only if looking for acceptance gains, or validate best defaults on a harder prompt"}}
{"run":150,"commit":"de24439","metric":10.244327,"metrics":{"e2e_tps":1.298807,"spec_sec":24.638,"gen_tokens":32,"steps":14,"committed":33,"step_ms":223.12,"pack_ms":1.12,"draft_ms":144.18,"topk_ms":20.05,"exact_ms":57.73,"acceptance":2.357},"status":"discard","description":"pure best-first K=4 with budget 36 — no acceptance gain over budget 32","timestamp":1777556896855,"segment":2,"confidence":8.160393835091734,"asi":{"hypothesis":"budget 36 may add useful branches under pure best-first K=4 and improve acceptance enough to offset extra overhead","rollback_reason":"budget 36 produced the same 14 steps, 33 committed tokens, and 2.357 acceptance as budget 32, while step_ms rose to 223ms and TPS fell to 10.24","next_action_hint":"budget 32 remains the sweet spot on this prompt; reset the harness default to 32 before further experiments"}}
{"run":151,"commit":"de24439","metric":9.803621,"metrics":{"e2e_tps":1.275815,"spec_sec":25.082,"gen_tokens":32,"steps":14,"committed":33,"step_ms":233.15,"pack_ms":0.98,"draft_ms":153.76,"topk_ms":20.33,"exact_ms":58.05,"acceptance":2.357},"status":"discard","description":"pure best-first K=4 with target_feat_ctx 112 — pack lower but draft slower","timestamp":1777556989888,"segment":2,"confidence":8.10071276246994,"asi":{"hypothesis":"target_feat_ctx 112 might lower feature packing and draft compute while pure best-first K=4 preserves the 14-step acceptance pattern","rollback_reason":"ctx112 kept acceptance but draft_ms rose to 153.8ms and step_ms to 233ms, dropping TPS to 9.80; the smaller pack cost did not translate to faster draft decode","next_action_hint":"keep target_feat_ctx 128; avoid further small-window tuning unless measuring multiple repeats or a different prompt"}}
{"run":152,"commit":"de24439","metric":9.751044,"metrics":{"e2e_tps":1.293086,"spec_sec":24.747,"gen_tokens":32,"steps":15,"committed":34,"step_ms":218.78,"pack_ms":1.09,"draft_ms":141.95,"topk_ms":20.11,"exact_ms":55.6,"acceptance":2.267},"status":"discard","description":"pure best-first K=4 with proposal_temp 0.5 — fewer accepted per step","timestamp":1777557081428,"segment":2,"confidence":8.586538754323378,"asi":{"hypothesis":"lower proposal_temp 0.5 may sharpen best-first scores and favor higher-confidence paths, possibly reducing exact validation work","rollback_reason":"temp 0.5 required 15 steps with acceptance 2.267 versus 14 steps at temp 0.7; lower per-step exact cost did not offset the extra step","next_action_hint":"keep proposal_temp 0.7; if sweeping temp, try slightly higher 0.85/1.0 rather than lower"}}
{"run":153,"commit":"de24439","metric":10.500824,"metrics":{"e2e_tps":1.308472,"spec_sec":24.456,"gen_tokens":32,"steps":14,"committed":33,"step_ms":217.67,"pack_ms":1.12,"draft_ms":138.2,"topk_ms":20.33,"exact_ms":57.98,"acceptance":2.357},"status":"discard","description":"pure best-first K=4 with proposal_temp 0.85 — close but below 0.7 best","timestamp":1777557170324,"segment":2,"confidence":8.10071276246994,"asi":{"hypothesis":"slightly higher proposal_temp 0.85 may diversify pure best-first branches without changing the 14-step acceptance pattern","rollback_reason":"temp 0.85 preserved acceptance and was fast, but TPS 10.50 remained below the kept 10.59 run at temp 0.7","next_action_hint":"0.7 and 0.85 are within noise; keep 0.7 as current best and test a structurally different knob such as tree row count or exact validation batching"}}
{"run":154,"commit":"de24439","metric":10.038712,"metrics":{"e2e_tps":1.290895,"spec_sec":24.789,"gen_tokens":32,"steps":14,"committed":33,"step_ms":227.69,"pack_ms":1.12,"draft_ms":147.82,"topk_ms":20.69,"exact_ms":58.03,"acceptance":2.357},"status":"discard","description":"reuse draft llama_batch across steps — allocation removal does not improve TPS","timestamp":1777557336806,"segment":2,"confidence":8.8935689439265,"asi":{"hypothesis":"allocating and freeing the draft embedding llama_batch every speculative step may add overhead or disturb allocator/cache behavior; reusing one batch should reduce draft-step overhead","rollback_reason":"reusing draft_batch passed correctness but TPS fell to 10.04 and draft_ms rose to 147.8ms; batch allocation is not a meaningful bottleneck or the result is noise below best","next_action_hint":"avoid micro-optimizing host batch allocation; focus on reducing draft llama_decode time, exact validation calls, or proposal acceptance"}}
{"run":155,"commit":"de24439","metric":10.016276,"metrics":{"e2e_tps":1.293191,"spec_sec":24.745,"gen_tokens":32,"steps":14,"committed":33,"step_ms":228.2,"pack_ms":1.12,"draft_ms":147.99,"topk_ms":21.17,"exact_ms":57.89,"acceptance":2.357},"status":"discard","description":"skip root draft logits output — fewer logits rows but no speedup","timestamp":1777557465323,"segment":2,"confidence":9.858464879449215,"asi":{"hypothesis":"the draft root position logits are unused; disabling logits for batch position 0 should reduce one vocab row of output materialization and top-k sync work","rollback_reason":"correctness passed, but TPS fell to 10.02 and topk_ms increased to 21.17ms; outputting one fewer row does not reduce the dominant GPU synchronization cost","next_action_hint":"do not pursue small logits-row reductions; a real top-k improvement likely needs GPU-side argmax/top-k or eliminating the sync"}}
{"run":156,"commit":"de24439","metric":10.259041,"metrics":{"e2e_tps":1.295861,"spec_sec":24.694,"gen_tokens":32,"steps":14,"committed":33,"step_ms":222.8,"pack_ms":1.12,"draft_ms":142.96,"topk_ms":20.67,"exact_ms":58.01,"acceptance":2.357},"status":"discard","description":"capped chain seed depth 4 with K=4 — no gain over pure best-first","timestamp":1777557573722,"segment":2,"confidence":9.583840128587989,"asi":{"hypothesis":"a short greedy-chain seed may keep early top-1 continuity while leaving most of the budget for best-first branches, combining benefits of chain_seed and pure best-first","rollback_reason":"chain_seed cap 4 produced the same acceptance as pure best-first but slower step_ms and 10.26 TPS, below the 10.59 best","next_action_hint":"pure best-first remains better; if revisiting chain caps, test only if cross-prompt acceptance requires it"}}
{"run":157,"commit":"de24439","metric":9.168135,"metrics":{"e2e_tps":1.262178,"spec_sec":25.353,"gen_tokens":32,"steps":15,"committed":34,"step_ms":232.69,"pack_ms":1.13,"draft_ms":154.03,"topk_ms":21.51,"exact_ms":55.98,"acceptance":2.267},"status":"discard","description":"n_batch/n_ubatch 256 exact path — acceptance pattern worsens and TPS drops","timestamp":1777557665480,"segment":2,"confidence":8.417775842329654,"asi":{"hypothesis":"reducing n_batch/n_ubatch from 512 to 256 may reduce buffer pressure and improve decode latency without affecting outputs","rollback_reason":"n_batch 256 changed the speculative path to 15 steps with lower acceptance and slower draft/topk timing; TPS fell to 9.17","next_action_hint":"keep n_batch/n_ubatch at 512 for the benchmark; do not reduce batch sizing unless targeting memory footprint rather than TPS"}}
{"run":158,"commit":"de24439","metric":0,"metrics":{"e2e_tps":0,"spec_sec":0,"gen_tokens":0,"steps":0,"committed":0,"step_ms":0,"pack_ms":0,"draft_ms":0,"topk_ms":0,"exact_ms":0,"acceptance":0},"status":"crash","description":"n_batch/n_ubatch 1024 exact path — target context OOM","timestamp":1777557702217,"segment":2,"confidence":8.417775842329654,"asi":{"hypothesis":"increasing n_batch/n_ubatch to 1024 may improve graph efficiency or decode scheduling compared with 512","rollback_reason":"context initialization failed allocating a 3.6 GiB CUDA compute buffer for 1024-token graph reserve; no benchmark metrics produced","next_action_hint":"512 is the practical upper bound on the 24GB Castle GPU for this configuration; reset n_batch/n_ubatch to 512"}}
{"run":159,"commit":"de24439","metric":9.87179,"metrics":{"e2e_tps":1.293923,"spec_sec":24.731,"gen_tokens":32,"steps":14,"committed":33,"step_ms":231.54,"pack_ms":1.11,"draft_ms":151.29,"topk_ms":20.99,"exact_ms":58.12,"acceptance":2.357},"status":"discard","description":"reuse 1-token target validation batch — no host allocation speedup","timestamp":1777557879589,"segment":2,"confidence":8.8935689439265,"asi":{"hypothesis":"exact chain validation allocates a 1-token llama_batch for every validated node; reusing one batch may reduce per-step host overhead","rollback_reason":"reusable target batch passed correctness but TPS fell to 9.87 and exact_ms stayed ~58ms; target llama_decode dominates, not batch allocation","next_action_hint":"stop pursuing host allocation micro-optimizations; focus on reducing number of target validation decodes or draft GPU compute"}}
{"run":160,"commit":"de24439","metric":9.727272,"metrics":{"e2e_tps":1.286174,"spec_sec":24.88,"gen_tokens":32,"steps":14,"committed":33,"step_ms":234.98,"pack_ms":1.12,"draft_ms":155.15,"topk_ms":20.47,"exact_ms":58.22,"acceptance":2.357},"status":"discard","description":"pure best-first actual top_k=3 — same acceptance but slower timing","timestamp":1777557966866,"segment":2,"confidence":9.426370246854823,"asi":{"hypothesis":"K=3 may preserve the K=4 acceptance path while slightly reducing top-k work and tree branching","rollback_reason":"K=3 kept the 14-step acceptance pattern but draft/step timing was worse and TPS fell to 9.73; K=4 remains the best tested K above the K=2 acceptance cliff","next_action_hint":"keep top_k=4; K changes mostly affect acceptance, not the ~20ms sync-bound top-k cost"}}
{"run":161,"commit":"de24439","metric":9.447052,"metrics":{"e2e_tps":1.282977,"spec_sec":24.942,"gen_tokens":32,"steps":14,"committed":33,"step_ms":241.95,"pack_ms":1.13,"draft_ms":159.54,"topk_ms":22.89,"exact_ms":58.35,"acceptance":2.357},"status":"discard","description":"pure best-first K=4 with proposal_temp 1.0 — same acceptance but much slower","timestamp":1777558057033,"segment":2,"confidence":8.586538754323378,"asi":{"hypothesis":"proposal_temp 1.0 may diversify branch scores more than 0.7 and improve path quality","rollback_reason":"temp 1.0 did not improve acceptance and produced much slower draft/topk timing, dropping TPS to 9.45","next_action_hint":"keep proposal_temp 0.7; higher temperatures above 0.85 are not useful on this prompt"}}
{"run":162,"commit":"de24439","metric":10.433717,"metrics":{"e2e_tps":1.297806,"spec_sec":24.657,"gen_tokens":32,"steps":14,"committed":33,"step_ms":219.07,"pack_ms":1.13,"draft_ms":140.33,"topk_ms":19.74,"exact_ms":57.83,"acceptance":2.357},"status":"discard","description":"pure best-first K=4 with block_size 12 — close but below block_size 16 best","timestamp":1777558168159,"segment":2,"confidence":8.10071276246994,"asi":{"hypothesis":"block_size 12 may reduce draft work while retaining the same accepted path under pure best-first K=4","rollback_reason":"block_size 12 preserved acceptance and was faster than many noisy runs, but TPS 10.43 stayed below the kept 10.59 block_size 16 baseline","next_action_hint":"block_size 12 is close enough to revisit with repeats only if the 10.59 baseline proves noisy; for now keep default block_size 16"}}
{"run":163,"commit":"de24439","metric":9.508358,"metrics":{"e2e_tps":1.284934,"spec_sec":24.904,"gen_tokens":32,"steps":14,"committed":33,"step_ms":240.39,"pack_ms":1.12,"draft_ms":159.02,"topk_ms":22.22,"exact_ms":58,"acceptance":2.357},"status":"discard","description":"repeat current best defaults after sweep — noisy low baseline repeat","timestamp":1777558253018,"segment":2,"confidence":7.901161640801124,"asi":{"hypothesis":"repeat the current best defaults after restoring block_size 16 to measure noise and ensure the harness is back on the kept config","rollback_reason":"repeat preserved correctness and acceptance but landed in a slow-noise band at 9.51 TPS, below the 10.59 kept best","next_action_hint":"the metric is noisy with draft_ms spanning ~137-159ms; future keeps should be repeated, and block_size 12/0.85 temp close calls should not be kept without repeated wins"}}
{"run":164,"commit":"de24439","metric":10.074996,"metrics":{"e2e_tps":1.299123,"spec_sec":24.632,"gen_tokens":32,"steps":14,"committed":33,"step_ms":226.87,"pack_ms":1.11,"draft_ms":147.9,"topk_ms":20.15,"exact_ms":57.67,"acceptance":2.357},"status":"discard","description":"repeat current best defaults while refocusing on draft decode","timestamp":1777559045624,"segment":2,"confidence":8.10071276246994,"asi":{"hypothesis":"repeat current best pure best-first K=4 baseline to measure the current draft_ms noise band before changing draft decode","rollback_reason":"baseline repeat did not beat kept 10.59 TPS; no code/config change to keep","next_action_hint":"use draft_ms around 148ms as current reference; inspect full log for dflash_draft_ubatch_timing compute vs set_inputs breakdown"}}
{"run":165,"commit":"de24439","metric":9.79312,"metrics":{"e2e_tps":1.298122,"spec_sec":24.651,"gen_tokens":32,"steps":14,"committed":33,"step_ms":233.4,"pack_ms":1.13,"draft_ms":154,"topk_ms":20.15,"exact_ms":58.09,"acceptance":2.357},"status":"discard","description":"repeat block_size 12 draft decode optimization candidate","timestamp":1777559136372,"segment":2,"confidence":8.586538754323378,"asi":{"hypothesis":"block_size 12 may reduce draft graph work while preserving the same 14-step acceptance path, improving draft decode","rollback_reason":"repeat was slower: draft_ms 154ms and TPS 9.79, below kept block_size 16 best","next_action_hint":"do not keep block_size 12; draft compute does not scale down reliably with fewer block tokens"}}
{"run":166,"commit":"de24439","metric":8.835483,"metrics":{"e2e_tps":1.263923,"spec_sec":25.318,"gen_tokens":32,"steps":16,"committed":33,"step_ms":226.36,"pack_ms":0.82,"draft_ms":152.98,"topk_ms":21.5,"exact_ms":51.03,"acceptance":2.062},"status":"discard","description":"pure best-first K=4 with target_feat_ctx 96 to reduce draft FC/attention context","timestamp":1777559283158,"segment":2,"confidence":7.884112887310015,"asi":{"hypothesis":"shrinking target_feat_ctx from 128 to 96 should reduce draft target_feat FC/attention work enough to offset any small acceptance loss","rollback_reason":"ctx96 reduced pack only; draft_ms rose to 153ms and acceptance fell to 2.06, requiring 16 steps and lowering TPS to 8.84","next_action_hint":"target feature context below 128 damages proposal quality and does not reduce draft compute in practice; keep ctx128"}}
{"run":167,"commit":"de24439","metric":8.609927,"metrics":{"e2e_tps":1.273125,"spec_sec":25.135,"gen_tokens":32,"steps":16,"committed":33,"step_ms":232.29,"pack_ms":0.83,"draft_ms":158.47,"topk_ms":22.05,"exact_ms":50.9,"acceptance":2.062},"status":"discard","description":"F16 target_feat_raw draft input under accidental ctx96 — slower and lower acceptance","timestamp":1777559419904,"segment":2,"confidence":7.637615029697407,"asi":{"hypothesis":"using F16 target_feat_raw may reduce draft input bandwidth and FC compute cost","rollback_reason":"run used ctx96 left in autoresearch.sh from the previous config experiment, and F16 input was slower with lower acceptance; not a valid improvement","next_action_hint":"restore autoresearch.sh target_feat_ctx default to 128 before retrying any target_feat format experiment; be aware log_experiment preserves autoresearch.sh changes"}}
{"run":168,"commit":"de24439","metric":10.134408,"metrics":{"e2e_tps":1.297175,"spec_sec":24.669,"gen_tokens":32,"steps":14,"committed":33,"step_ms":225.54,"pack_ms":1.13,"draft_ms":146.19,"topk_ms":20.52,"exact_ms":57.67,"acceptance":2.357},"status":"discard","description":"F16 target_feat_raw draft input at ctx128","timestamp":1777559542537,"segment":2,"confidence":7.884112887310015,"asi":{"hypothesis":"using F16 target_feat_raw at the draft graph input should cut target_feat upload bandwidth and may speed the FC projection without changing exact correctness","rollback_reason":"correctness passed but TPS 10.13 stayed below kept 10.59 and draft_ms 146ms is within normal baseline noise; CPU F32→F16 conversion plus F16 input did not produce a real speedup","next_action_hint":"do not change target_feat_raw to F16 via host conversion; if revisiting, capture/pack directly in F16 or move compression to GPU to avoid CPU conversion cost"}}
{"run":169,"commit":"de24439","metric":6.018997,"metrics":{"e2e_tps":1.197694,"spec_sec":26.718,"gen_tokens":32,"steps":25,"committed":32,"step_ms":212.66,"pack_ms":1.04,"draft_ms":158.08,"topk_ms":21.8,"exact_ms":31.7,"acceptance":1.28},"status":"discard","description":"compute draft lm_head only for non-root output rows","timestamp":1777559673944,"segment":2,"confidence":7.637615029697407,"asi":{"hypothesis":"applying out_ids before the draft lm_head and omitting the unused root logits row should reduce draft output projection work","rollback_reason":"correctness passed but proposal alignment broke: acceptance fell to 1.28 and steps rose to 25; draft_ms also increased to 158ms","next_action_hint":"the dflash draft logits rows are position-sensitive; do not compact rows before lm_head without verifying row/order semantics against standalone DFlash"}}
{"run":170,"commit":"de24439","metric":10.329045,"metrics":{"e2e_tps":1.302613,"spec_sec":24.566,"gen_tokens":32,"steps":14,"committed":33,"step_ms":221.29,"pack_ms":1.11,"draft_ms":142,"topk_ms":20.07,"exact_ms":58.08,"acceptance":2.357},"status":"discard","description":"allow graph reuse for single-sequence DFlash draft ubatch view","timestamp":1777559819914,"segment":2,"confidence":7.711205551462638,"asi":{"hypothesis":"DFlash draft graph reuse may be blocked by non-owning single-sequence ubatch views; allowing reuse could enable CUDA graph replay and reduce draft decode overhead","rollback_reason":"profile still showed reused=0 on every draft ubatch, so this was not the reuse blocker; TPS 10.33 was below kept 10.59 and within noise","next_action_hint":"if pursuing graph reuse, instrument llm_graph_params::allow_reuse to print which field differs rather than guessing"}}
{"run":171,"commit":"de24439","metric":9.832298,"metrics":{"e2e_tps":1.296229,"spec_sec":24.687,"gen_tokens":32,"steps":14,"committed":33,"step_ms":232.47,"pack_ms":1.13,"draft_ms":152.38,"topk_ms":20.99,"exact_ms":57.94,"acceptance":2.357},"status":"discard","description":"force DFlash draft graph params reuse to test CUDA graph replay hypothesis","timestamp":1777559990277,"segment":2,"confidence":7.637615029697407,"asi":{"hypothesis":"if llm_graph_params comparison is the blocker, forcing DFlash draft allow_reuse true should make reused=1 and reduce draft compute overhead","rollback_reason":"profile still showed reused=0 for every draft ubatch, so the graph is reset before can_reuse matters, likely by the DFlash recurrent memory update path; TPS was worse","next_action_hint":"graph reuse optimization would need to avoid gf_res_prev reset in the memory update path or use a separate persistent draft graph; not a simple params comparison fix"}}
{"run":172,"commit":"de24439","metric":9.756752,"metrics":{"e2e_tps":1.294656,"spec_sec":24.717,"gen_tokens":32,"steps":14,"committed":33,"step_ms":234.27,"pack_ms":1.13,"draft_ms":154.23,"topk_ms":20.78,"exact_ms":58.09,"acceptance":2.357},"status":"discard","description":"skip gf_res_prev reset for DFlash draft memory update","timestamp":1777560098862,"segment":2,"confidence":7.565415826302686,"asi":{"hypothesis":"the DFlash recurrent memory update resets gf_res_prev before every draft decode; skipping that reset may permit graph/CUDA graph reuse and reduce draft decode overhead","rollback_reason":"profile still showed reused=0 for every draft step and TPS fell to 9.76; the reset was not the only blocker or reuse is disabled elsewhere","next_action_hint":"avoid more graph-reuse guesses; instrument process_ubatch/can_reuse if this path is revisited"}}
{"run":173,"commit":"de24439","metric":9.997438,"metrics":{"e2e_tps":1.300232,"spec_sec":24.611,"gen_tokens":32,"steps":14,"committed":33,"step_ms":228.63,"pack_ms":1.12,"draft_ms":149.42,"topk_ms":20.16,"exact_ms":57.9,"acceptance":2.357},"status":"discard","description":"draft context n_batch/n_ubatch 16 exactly matching DFlash block size","timestamp":1777560191687,"segment":2,"confidence":8.276184968104477,"asi":{"hypothesis":"setting draft n_batch/n_ubatch to the exact 16-token block size may reduce draft graph reserve/buffer overhead and improve draft decode","rollback_reason":"correctness passed but draft_ms 149ms and TPS 10.00 were below the kept best; smaller draft batch did not improve compute","next_action_hint":"keep draft n_batch cap at 64; batch-size memory savings are not a decode-speed path here"}}
{"run":174,"commit":"de24439","metric":10.155571,"metrics":{"e2e_tps":1.315249,"spec_sec":24.33,"gen_tokens":32,"steps":14,"committed":34,"step_ms":225.07,"pack_ms":1.12,"draft_ms":145.05,"topk_ms":19.36,"exact_ms":59.5,"acceptance":2.429},"status":"discard","description":"draft context n_ctx 256 to reduce draft buffer footprint","timestamp":1777560274649,"segment":2,"confidence":8.845601423014735,"asi":{"hypothesis":"shrinking draft n_ctx to 256 may reduce scheduler/compute-buffer overhead for the DFlash draft graph while retaining correctness","rollback_reason":"correctness passed and acceptance rose to 2.43, but TPS 10.16 stayed below kept 10.59; draft_ms 145ms is only within noise of baseline","next_action_hint":"draft n_ctx can be very small without breaking correctness, but it is not a clear speedup; if memory footprint matters, retest n_ctx=256 separately for server/fast-rollback fit"}}
{"run":175,"commit":"de24439","metric":10.232403,"metrics":{"e2e_tps":1.301766,"spec_sec":24.582,"gen_tokens":32,"steps":14,"committed":33,"step_ms":223.38,"pack_ms":1.16,"draft_ms":144.6,"topk_ms":19.79,"exact_ms":57.8,"acceptance":2.357},"status":"discard","description":"block_size 20 to test larger DFlash draft block without acceptance loss","timestamp":1777560437231,"segment":2,"confidence":8.45678600725038,"asi":{"hypothesis":"a larger draft block may expose more candidate positions for the same budget and improve acceptance without substantially increasing draft decode time","rollback_reason":"block_size 20 preserved the same 14-step acceptance pattern and did not beat the kept 10.59 TPS; no improvement despite draft_ms being in a good noise band","next_action_hint":"block size changes mostly do not affect draft_ms or acceptance on this prompt; reset block_size default and focus elsewhere"}}
{"run":176,"commit":"de24439","metric":10.295083,"metrics":{"e2e_tps":1.303144,"spec_sec":24.556,"gen_tokens":32,"steps":14,"committed":33,"step_ms":222.02,"pack_ms":1.11,"draft_ms":142.8,"topk_ms":20.29,"exact_ms":57.78,"acceptance":2.357},"status":"discard","description":"block_size 32 with budget 32 under pure best-first K=4","timestamp":1777560542708,"segment":2,"confidence":8.10071276246994,"asi":{"hypothesis":"larger DFlash block_size 32 may let the same budget explore deeper draft positions and improve acceptance while draft compute remains mostly fixed-overhead","rollback_reason":"acceptance stayed identical at 2.357 and TPS 10.30 remained below kept 10.59; larger block did not change the accepted path","next_action_hint":"block_size can vary widely without affecting this prompt's accepted path; no need for more block_size sweeps unless cross-prompt behavior differs"}}
{"run":177,"commit":"de24439","metric":9.860292,"metrics":{"e2e_tps":1.307831,"spec_sec":24.468,"gen_tokens":32,"steps":14,"committed":34,"step_ms":231.81,"pack_ms":1.09,"draft_ms":150.43,"topk_ms":20.62,"exact_ms":59.64,"acceptance":2.429},"status":"discard","description":"draft n_ctx 256 combined with block_size 32","timestamp":1777560663130,"segment":2,"confidence":8.45678600725038,"asi":{"hypothesis":"combining small draft n_ctx with a larger block may keep the higher 2.43 acceptance while reducing or amortizing draft decode overhead","rollback_reason":"acceptance stayed at 2.43 but draft_ms rose to 150ms and TPS fell to 9.86; the combination is worse than either isolated noisy candidate","next_action_hint":"do not combine n_ctx=256 and larger block_size for speed; reset block_size default before continuing"}}
{"run":178,"commit":"de24439","metric":6.159437,"metrics":{"e2e_tps":1.203188,"spec_sec":26.596,"gen_tokens":32,"steps":24,"committed":33,"step_ms":216.47,"pack_ms":1.03,"draft_ms":160.45,"topk_ms":21.04,"exact_ms":33.93,"acceptance":1.375},"status":"discard","description":"skip target K/V attention for last two DFlash draft layers","timestamp":1777560789901,"segment":2,"confidence":7.884112887310015,"asi":{"hypothesis":"dropping target-feature K/V matmuls in later draft layers may reduce draft compute while leaving enough proposal quality for exact validation","rollback_reason":"proposal quality collapsed: acceptance 1.38, 24 steps, and draft_ms increased to 160ms; the altered graph likely lost CUDA efficiency and model accuracy","next_action_hint":"do not prune target K/V attention inside trained DFlash layers; accuracy loss overwhelms any theoretical compute saving"}}
{"run":179,"commit":"de24439","metric":9.67981,"metrics":{"e2e_tps":1.291156,"spec_sec":24.784,"gen_tokens":32,"steps":15,"committed":32,"step_ms":220.39,"pack_ms":1.1,"draft_ms":146.41,"topk_ms":20.39,"exact_ms":52.45,"acceptance":2.133},"status":"discard","description":"use only last 32 target_feat positions for last two DFlash draft layers","timestamp":1777560915280,"segment":2,"confidence":8.155576462101964,"asi":{"hypothesis":"later DFlash draft layers may only need a short recent target-feature window; limiting them to 32 positions could reduce K/V matmul and attention compute with smaller accuracy loss than dropping target K/V entirely","rollback_reason":"draft_ms did not improve meaningfully and acceptance fell to 2.13, adding an extra step; TPS 9.68 is below baseline","next_action_hint":"per-layer target window truncation hurts proposal quality before it produces measurable draft speedup; avoid this approximation"}}
{"run":180,"commit":"de24439","metric":8.819071,"metrics":{"e2e_tps":1.275714,"spec_sec":25.084,"gen_tokens":32,"steps":15,"committed":33,"step_ms":241.9,"pack_ms":1.12,"draft_ms":162.84,"topk_ms":23.27,"exact_ms":54.64,"acceptance":2.2},"status":"discard","description":"use last 64 target_feat positions only for final DFlash draft layer","timestamp":1777561023846,"segment":2,"confidence":7.565415826302686,"asi":{"hypothesis":"truncating only the final draft layer to a 64-position target window may preserve most quality while reducing one layer's K/V and attention cost","rollback_reason":"acceptance dropped to 2.20 and draft_ms worsened to 163ms; the graph variant is both less accurate and slower","next_action_hint":"stop per-layer target window truncation experiments; the trained graph wants the full 128-window layout"}}
{"run":181,"commit":"de24439","metric":9.513107,"metrics":{"e2e_tps":1.266624,"spec_sec":25.264,"gen_tokens":32,"steps":14,"committed":33,"step_ms":240.27,"pack_ms":1.11,"draft_ms":158,"topk_ms":22.79,"exact_ms":58.34,"acceptance":2.357},"status":"discard","description":"disable flash_attn flag only for DFlash draft context","timestamp":1777561140520,"segment":2,"confidence":7.420714849615349,"asi":{"hypothesis":"for the small DFlash draft attention shape, disabling the draft context flash-attention flag might choose a lower-overhead kernel","rollback_reason":"draft_ms worsened to 158ms and TPS fell to 9.51; flash attention should stay enabled for the draft context","next_action_hint":"do not disable draft flash attention; attention kernel changes need a targeted custom kernel, not the generic non-flash path"}}
{"run":182,"commit":"de24439","metric":10.198618,"metrics":{"e2e_tps":1.300654,"spec_sec":24.603,"gen_tokens":32,"steps":14,"committed":33,"step_ms":224.12,"pack_ms":1.12,"draft_ms":145.1,"topk_ms":19.86,"exact_ms":58,"acceptance":2.357},"status":"discard","description":"disable detailed DFlash draft ubatch profiling logs in benchmark run","timestamp":1777561240315,"segment":2,"confidence":7.406063452435399,"asi":{"hypothesis":"LLAMA_DDTREE_PROFILE logging inside every draft llama_decode may add measurable draft overhead; disabling it should improve draft_ms without losing driver metrics","rollback_reason":"TPS 10.20 and draft_ms 145ms did not beat the kept 10.59 run; any logging overhead is smaller than run-to-run noise","next_action_hint":"profile logs are useful for draft diagnostics; keep the benchmark default profiling unless doing production latency measurements"}}
{"run":183,"commit":"de24439","metric":10.448024,"metrics":{"e2e_tps":1.300126,"spec_sec":24.613,"gen_tokens":32,"steps":14,"committed":33,"step_ms":218.77,"pack_ms":1.11,"draft_ms":139.63,"topk_ms":20.23,"exact_ms":57.77,"acceptance":2.357},"status":"discard","description":"truly omit LLAMA_DDTREE_PROFILE env to remove draft ubatch logging","timestamp":1777561340487,"segment":2,"confidence":7.5178283267860095,"asi":{"hypothesis":"the script was still setting an empty LLAMA_DDTREE_PROFILE env var, so profiling remained enabled; conditionally omitting it should reduce draft logging overhead","rollback_reason":"draft_ms improved to 139.6ms but TPS 10.45 still did not beat kept best 10.59; as a benchmark-control change it should not be kept unless it clearly improves primary","next_action_hint":"profile omission is a useful production-latency check; reset the benchmark to profiling-on for diagnostics or repeat no-profile if the target metric should exclude profiler overhead"}}
{"run":184,"commit":"de24439","metric":8.95165,"metrics":{"e2e_tps":1.280307,"spec_sec":24.994,"gen_tokens":32,"steps":17,"committed":33,"step_ms":210.28,"pack_ms":1.09,"draft_ms":146.42,"topk_ms":15.12,"exact_ms":47.62,"acceptance":1.941},"status":"discard","description":"skip logsumexp in top-k extraction using raw scaled logits","timestamp":1777561476120,"segment":2,"confidence":7.333912639063997,"asi":{"hypothesis":"removing per-vocab exp/logsumexp from draft top-k should cut topk_ms while exact validation preserves output correctness","rollback_reason":"topk_ms improved from about 20ms to 15ms, but proposal scoring changed enough to drop acceptance to 1.94 and require 17 steps; TPS fell to 8.95","next_action_hint":"try a cheaper row-relative score or approximate length penalty if pursuing top-k CPU cost, but raw logits alone distort tree ranking"}}
{"run":185,"commit":"de24439","metric":9.468352,"metrics":{"e2e_tps":1.284058,"spec_sec":24.921,"gen_tokens":32,"steps":16,"committed":33,"step_ms":211.23,"pack_ms":1.09,"draft_ms":144.33,"topk_ms":15.24,"exact_ms":50.53,"acceptance":2.062},"status":"discard","description":"top-k extraction with row-relative scores instead of logsumexp","timestamp":1777561568397,"segment":2,"confidence":7.124543556285478,"asi":{"hypothesis":"row-relative top-k scores may preserve within-row candidate ordering while removing expensive logsumexp, recovering acceptance compared with raw logits","rollback_reason":"topk_ms stayed low at 15.2ms but acceptance only recovered to 2.06, still requiring 16 steps; TPS 9.47 below best","next_action_hint":"the original logsumexp/row entropy term is important for tree depth ranking; any cheaper approximation needs an explicit depth/entropy penalty"}}
{"run":186,"commit":"de24439","metric":9.291117,"metrics":{"e2e_tps":1.260587,"spec_sec":25.385,"gen_tokens":32,"steps":15,"committed":34,"step_ms":229.61,"pack_ms":1.11,"draft_ms":156.59,"topk_ms":16.14,"exact_ms":55.73,"acceptance":2.267},"status":"discard","description":"top-k-only approximate logsumexp for proposal scores","timestamp":1777561666340,"segment":2,"confidence":6.926796845424066,"asi":{"hypothesis":"approximating logsumexp from only retained top-K logits may keep the row uncertainty penalty while avoiding full-vocab exp cost","rollback_reason":"topk_ms improved to 16ms, but acceptance dropped to 2.27 and draft_ms noise was high; TPS 9.29 is below best","next_action_hint":"top-K-only normalization is not close enough; the full-vocab tail mass affects DDTree depth ranking on this prompt"}}
{"run":187,"commit":"de24439","metric":10.548801,"metrics":{"e2e_tps":1.301766,"spec_sec":24.582,"gen_tokens":32,"steps":14,"committed":33,"step_ms":216.68,"pack_ms":1.12,"draft_ms":142.9,"topk_ms":14.74,"exact_ms":57.88,"acceptance":2.357},"status":"discard","description":"top-K-only normalization with larger budget 40 to recover acceptance","timestamp":1777561796176,"segment":2,"confidence":6.8563876878805425,"asi":{"hypothesis":"the cheap top-K-only logsumexp approximation saves about 5ms topk; increasing budget to 40 may recover the 14-step accepted path and beat the baseline","rollback_reason":"promising but TPS 10.55 was still just below kept 10.59, so not a primary improvement by the rules","next_action_hint":"repeat this exact combo (top-K-only normalization + budget 40) because it recovered acceptance and cut topk_ms to 14.7; a repeat may exceed the noisy best"}}