|
195 | 195 | {"train/step": 2250, "train/loss": 0.3139530122280121, "train/spikes_per_inference": 93.53617265489366, "train/router_alpha_mean": 0.25, "train/block_0_spike_rate_q": 0.27248942852020264, "train/block_0_spike_rate_v": 0.24066263437271118, "train/block_0_q_min": -763.4625854492188, "train/block_0_alpha_mean": 0.25, "train/block_1_spike_rate_q": 0.32022908329963684, "train/block_1_spike_rate_v": 0.3113449215888977, "train/block_1_q_min": -3.1354899406433105, "train/block_1_alpha_mean": 0.2499999850988388, "train/spikes_total_raw": 1103397590.0, "train/max_attn": 31.466251373291016} |
196 | 196 | {"train/step": 2500, "train/loss": 0.3689018487930298, "train/spikes_per_inference": 163.10530870225693, "train/router_alpha_mean": 0.25, "train/block_0_spike_rate_q": 0.27536672353744507, "train/block_0_spike_rate_v": 0.24373474717140198, "train/block_0_q_min": -786.108642578125, "train/block_0_alpha_mean": 0.25, "train/block_1_spike_rate_q": 0.3216303586959839, "train/block_1_spike_rate_v": 0.3183247745037079, "train/block_1_q_min": -3.3191134929656982, "train/block_1_alpha_mean": 0.2499999850988388, "train/spikes_total_raw": 1924068512.0, "train/max_attn": 33.94112777709961} |
197 | 197 | {"train/step": 2750, "train/loss": 0.3947153389453888, "train/spikes_per_inference": 233.57178302341038, "train/router_alpha_mean": 0.25, "train/block_0_spike_rate_q": 0.2763076722621918, "train/block_0_spike_rate_v": 0.24003244936466217, "train/block_0_q_min": -782.0375366210938, "train/block_0_alpha_mean": 0.25, "train/block_1_spike_rate_q": 0.34148916602134705, "train/block_1_spike_rate_v": 0.30428263545036316, "train/block_1_q_min": -2.8819613456726074, "train/block_1_alpha_mean": 0.25, "train/spikes_total_raw": 2755324867.0, "train/max_attn": 35.35533905029297} |
| 198 | +{"epoch": 40, "val/mean_return": 147.39, "val/std_return": 34.18739387552084, "val/spikes_per_inference": 4282.4074490017365} |
| 199 | +{"train/step": 3000, "train/loss": 0.3806813061237335, "train/spikes_per_inference": 127.47803081936307, "train/router_alpha_mean": 0.2500000298023224, "train/block_0_spike_rate_q": 0.2823430299758911, "train/block_0_spike_rate_v": 0.24599608778953552, "train/block_0_q_min": -784.9763793945312, "train/block_0_alpha_mean": 0.2500000298023224, "train/block_1_spike_rate_q": 0.3445475399494171, "train/block_1_spike_rate_v": 0.31401926279067993, "train/block_1_q_min": -2.983755350112915, "train/block_1_alpha_mean": 0.2499999850988388, "train/spikes_total_raw": 1503792041.0, "train/max_attn": 36.769554138183594} |
| 200 | +{"train/step": 3250, "train/loss": 0.348725289106369, "train/spikes_per_inference": 199.1956642150879, "train/router_alpha_mean": 0.2499999850988388, "train/block_0_spike_rate_q": 0.28220927715301514, "train/block_0_spike_rate_v": 0.24271240830421448, "train/block_0_q_min": -752.0289916992188, "train/block_0_alpha_mean": 0.2499999850988388, "train/block_1_spike_rate_q": 0.33830514550209045, "train/block_1_spike_rate_v": 0.31850025057792664, "train/block_1_q_min": -3.0039777755737305, "train/block_1_alpha_mean": 0.25, "train/spikes_total_raw": 2349807669.0, "train/max_attn": 36.06244659423828} |
| 201 | +{"epoch": 50, "val/mean_return": 78.95, "val/std_return": 55.71110751008276, "val/spikes_per_inference": 2316.244070095486} |
| 202 | +{"train/step": 3500, "train/loss": 0.4545723795890808, "train/spikes_per_inference": 45.78157450358073, "train/router_alpha_mean": 0.2499999850988388, "train/block_0_spike_rate_q": 0.2875020205974579, "train/block_0_spike_rate_v": 0.23727265000343323, "train/block_0_q_min": -796.530029296875, "train/block_0_alpha_mean": 0.2499999850988388, "train/block_1_spike_rate_q": 0.33224740624427795, "train/block_1_spike_rate_v": 0.3152882754802704, "train/block_1_q_min": -3.263282060623169, "train/block_1_alpha_mean": 0.25, "train/spikes_total_raw": 540061428.0, "train/max_attn": 36.06244659423828} |
| 203 | +{"train/param_count": 115911} |
| 204 | +{"train/step": 3750, "train/loss": 0.41970255970954895, "train/spikes_per_inference": 117.21909806993273, "train/router_alpha_mean": 0.25, "train/block_0_spike_rate_q": 0.2938033938407898, "train/block_0_spike_rate_v": 0.2422531098127365, "train/block_0_q_min": -799.7066650390625, "train/block_0_alpha_mean": 0.25, "train/block_1_spike_rate_q": 0.31651660799980164, "train/block_1_spike_rate_v": 0.3159032166004181, "train/block_1_q_min": -3.068474292755127, "train/block_1_alpha_mean": 0.25, "train/spikes_total_raw": 1382772746.0, "train/max_attn": 36.06244659423828} |
0 commit comments