Skip to content

Commit 79a89e0

Browse files
vast H200 evidence for PR-R1e (write-path expansion): capacity is NOT the bottleneck
Three write-path expansions on the small-vocab task (aux=0.1, 2000 steps, sequential on one H200 to avoid the multi-bridge OOM):

    variant                          localization   recall
    alpha (FFN 4x write path)        0.375          0.14
    beta  (3 bridges @ 8/14/20)      0.125          0.10
    gamma (full pre-norm block)      0.819          0.12
    (R1d reference, no expansion)    0.50           0.10-0.16

gamma drives localization to 0.82 -- clearing the decision-matrix 0.80 'high' bar -- with a healthy, decreasing aux loss (1.35 -> 0.20; no capture/init bug). Yet recall stays ~0.12 across ALL three expansions, identical to R1d with no expansion at all.

Verdict: the R1d 'write-capacity bottleneck' hypothesis is refuted. Even a full transformer-block bridge that locates the needle 82% of the time cannot project it into a decodable residual. The proposer's last-layer hidden at the needle position does not carry the answer in a form a cross-attention bridge can transcribe. This is strong evidence against ADR 0011 section 3's single-bridge coupling -> ADR 0010 (PR #66).

Note on convergence: alpha (FFN) localizes slower than R1d (0.375 vs 0.50) because the FFN write-path gradients inflate the global clip_grad_norm and attenuate the q/k attention gradients; gamma's LayerNorms counteract this and localize best (0.82). No code bug; the aux mechanism works in every variant where its target bridge is supervised (alpha, gamma).

Co-authored-by: FluffyAIcode <FluffyAIcode@users.noreply.github.com>
1 parent 1831b5e commit 79a89e0

6 files changed

Lines changed: 1502 additions & 0 deletions
Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,259 @@
1+
{
2+
"schema_version": 5,
3+
"kind": "adr_0011_toy_prototype_g_x1",
4+
"config": {
5+
"model": "google/gemma-3-1b-it",
6+
"device": "cuda",
7+
"attn_implementation": "eager",
8+
"cross_attn_depth": 20,
9+
"cross_attn_depths": [
10+
20
11+
],
12+
"sink": 4,
13+
"window": 64,
14+
"num_heads": 16,
15+
"head_dim": 128,
16+
"train_steps": 2000,
17+
"lr": 0.0003,
18+
"o_proj_init_std": 0.01,
19+
"bridge_use_ffn_write_path": true,
20+
"bridge_use_block_architecture": false,
21+
"ffn_expansion": 4,
22+
"n_trainable_params": 20056320,
23+
"retrieval_aux_weight": 0.1,
24+
"needle_debug_mode": "small",
25+
"needle_vocab_size": 20,
26+
"n_train": 200,
27+
"n_eval": 50,
28+
"haystack_min_tokens": 256,
29+
"haystack_max_tokens": 1024,
30+
"seed": 42,
31+
"uses_chat_template": true,
32+
"verifier_layer_surgery": "forward_hook_on_layer_K_output"
33+
},
34+
"pre_train": {
35+
"cross_attn_recall": 0.0,
36+
"baseline_recall": 0.0,
37+
"oracle_recall": 1.0,
38+
"localization_rate": 0.135,
39+
"mass_on_needle": 0.13375000000000087,
40+
"needle_found_rate": 1.0
41+
},
42+
"training_history": [
43+
{
44+
"step": 100,
45+
"cross_attn_recall": 0.1,
46+
"baseline_recall": 0.0,
47+
"ce_loss_avg10": 0.816216093301773,
48+
"aux_loss_avg10": 1.2515625,
49+
"localization_rate": 0.290625,
50+
"mass_on_needle": 0.290625,
51+
"needle_found_rate": 1.0
52+
},
53+
{
54+
"step": 200,
55+
"cross_attn_recall": 0.25,
56+
"baseline_recall": 0.0,
57+
"ce_loss_avg10": 0.8468579649925232,
58+
"aux_loss_avg10": 1.26640625,
59+
"localization_rate": 0.28125,
60+
"mass_on_needle": 0.28125,
61+
"needle_found_rate": 1.0
62+
},
63+
{
64+
"step": 300,
65+
"cross_attn_recall": 0.0,
66+
"baseline_recall": 0.0,
67+
"ce_loss_avg10": 0.9260371834039688,
68+
"aux_loss_avg10": 1.13984375,
69+
"localization_rate": 0.28125,
70+
"mass_on_needle": 0.28125,
71+
"needle_found_rate": 1.0
72+
},
73+
{
74+
"step": 400,
75+
"cross_attn_recall": 0.05,
76+
"baseline_recall": 0.0,
77+
"ce_loss_avg10": 0.7788902819156647,
78+
"aux_loss_avg10": 1.05390625,
79+
"localization_rate": 0.34375,
80+
"mass_on_needle": 0.34375,
81+
"needle_found_rate": 1.0
82+
},
83+
{
84+
"step": 500,
85+
"cross_attn_recall": 0.1,
86+
"baseline_recall": 0.0,
87+
"ce_loss_avg10": 0.7642070293426514,
88+
"aux_loss_avg10": 1.15703125,
89+
"localization_rate": 0.3125,
90+
"mass_on_needle": 0.3125,
91+
"needle_found_rate": 1.0
92+
},
93+
{
94+
"step": 600,
95+
"cross_attn_recall": 0.2,
96+
"baseline_recall": 0.0,
97+
"ce_loss_avg10": 0.7696174263954163,
98+
"aux_loss_avg10": 1.1640625,
99+
"localization_rate": 0.3125,
100+
"mass_on_needle": 0.3125,
101+
"needle_found_rate": 1.0
102+
},
103+
{
104+
"step": 700,
105+
"cross_attn_recall": 0.1,
106+
"baseline_recall": 0.0,
107+
"ce_loss_avg10": 0.6628526449203491,
108+
"aux_loss_avg10": 1.145703125,
109+
"localization_rate": 0.321875,
110+
"mass_on_needle": 0.3203125,
111+
"needle_found_rate": 1.0
112+
},
113+
{
114+
"step": 800,
115+
"cross_attn_recall": 0.1,
116+
"baseline_recall": 0.0,
117+
"ce_loss_avg10": 0.8468569949269295,
118+
"aux_loss_avg10": 1.1390625,
119+
"localization_rate": 0.31875,
120+
"mass_on_needle": 0.3171875,
121+
"needle_found_rate": 1.0
122+
},
123+
{
124+
"step": 900,
125+
"cross_attn_recall": 0.1,
126+
"baseline_recall": 0.0,
127+
"ce_loss_avg10": 0.7523266643285751,
128+
"aux_loss_avg10": 1.0953125,
129+
"localization_rate": 0.31875,
130+
"mass_on_needle": 0.31875,
131+
"needle_found_rate": 1.0
132+
},
133+
{
134+
"step": 1000,
135+
"cross_attn_recall": 0.05,
136+
"baseline_recall": 0.0,
137+
"ce_loss_avg10": 0.7162347197532654,
138+
"aux_loss_avg10": 1.1640625,
139+
"localization_rate": 0.31875,
140+
"mass_on_needle": 0.31875,
141+
"needle_found_rate": 1.0
142+
},
143+
{
144+
"step": 1100,
145+
"cross_attn_recall": 0.15,
146+
"baseline_recall": 0.0,
147+
"ce_loss_avg10": 0.7101544559001922,
148+
"aux_loss_avg10": 1.1640625,
149+
"localization_rate": 0.3125,
150+
"mass_on_needle": 0.3125,
151+
"needle_found_rate": 1.0
152+
},
153+
{
154+
"step": 1200,
155+
"cross_attn_recall": 0.2,
156+
"baseline_recall": 0.0,
157+
"ce_loss_avg10": 0.7298780381679535,
158+
"aux_loss_avg10": 1.1234375,
159+
"localization_rate": 0.3125,
160+
"mass_on_needle": 0.3125,
161+
"needle_found_rate": 1.0
162+
},
163+
{
164+
"step": 1300,
165+
"cross_attn_recall": 0.05,
166+
"baseline_recall": 0.0,
167+
"ce_loss_avg10": 0.6709768295288085,
168+
"aux_loss_avg10": 1.1375,
169+
"localization_rate": 0.3125,
170+
"mass_on_needle": 0.3125,
171+
"needle_found_rate": 1.0
172+
},
173+
{
174+
"step": 1400,
175+
"cross_attn_recall": 0.1,
176+
"baseline_recall": 0.0,
177+
"ce_loss_avg10": 0.9122021347284317,
178+
"aux_loss_avg10": 1.15390625,
179+
"localization_rate": 0.3125,
180+
"mass_on_needle": 0.3125,
181+
"needle_found_rate": 1.0
182+
},
183+
{
184+
"step": 1500,
185+
"cross_attn_recall": 0.15,
186+
"baseline_recall": 0.0,
187+
"ce_loss_avg10": 0.43685243688523767,
188+
"aux_loss_avg10": 1.07109375,
189+
"localization_rate": 0.3625,
190+
"mass_on_needle": 0.3625,
191+
"needle_found_rate": 1.0
192+
},
193+
{
194+
"step": 1600,
195+
"cross_attn_recall": 0.1,
196+
"baseline_recall": 0.0,
197+
"ce_loss_avg10": 0.739033830165863,
198+
"aux_loss_avg10": 1.051953125,
199+
"localization_rate": 0.346875,
200+
"mass_on_needle": 0.346875,
201+
"needle_found_rate": 1.0
202+
},
203+
{
204+
"step": 1700,
205+
"cross_attn_recall": 0.2,
206+
"baseline_recall": 0.0,
207+
"ce_loss_avg10": 0.895532414317131,
208+
"aux_loss_avg10": 1.04609375,
209+
"localization_rate": 0.375,
210+
"mass_on_needle": 0.375,
211+
"needle_found_rate": 1.0
212+
},
213+
{
214+
"step": 1800,
215+
"cross_attn_recall": 0.1,
216+
"baseline_recall": 0.0,
217+
"ce_loss_avg10": 0.823189201951027,
218+
"aux_loss_avg10": 1.051171875,
219+
"localization_rate": 0.375,
220+
"mass_on_needle": 0.375,
221+
"needle_found_rate": 1.0
222+
},
223+
{
224+
"step": 1900,
225+
"cross_attn_recall": 0.25,
226+
"baseline_recall": 0.0,
227+
"ce_loss_avg10": 0.8548207342624664,
228+
"aux_loss_avg10": 0.995703125,
229+
"localization_rate": 0.375,
230+
"mass_on_needle": 0.375,
231+
"needle_found_rate": 1.0
232+
},
233+
{
234+
"step": 2000,
235+
"cross_attn_recall": 0.15,
236+
"baseline_recall": 0.0,
237+
"ce_loss_avg10": 0.7280006021261215,
238+
"aux_loss_avg10": 0.98046875,
239+
"localization_rate": 0.375,
240+
"mass_on_needle": 0.375,
241+
"needle_found_rate": 1.0
242+
}
243+
],
244+
"final": {
245+
"cross_attn_recall": 0.14,
246+
"baseline_recall": 0.0,
247+
"oracle_recall": 1.0,
248+
"localization_rate": 0.375,
249+
"mass_on_needle": 0.375,
250+
"needle_found_rate": 1.0,
251+
"elapsed_s": 732.4580546410289
252+
},
253+
"gate_predicates": {
254+
"oracle_ge_080": true,
255+
"bounded_le_030": true,
256+
"cross_attn_ge_080": false
257+
},
258+
"gate_g_x1_pass": false
259+
}

0 commit comments

Comments
 (0)