Refine profit baseline reprobes

Anbeeld · Anbeeld · commit 3109a0bfa06c · 2026-05-13T18:32:30.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,7 +4,7 @@
 
 - Fixed the adaptive `profit` controller's no-spec baseline path. Profit mode now seeds baseline samples before positive-depth warmup, can shut DFlash fully off when the measured baseline wins, and no longer makes speculative decisions from draft-only telemetry.
 - Fixed a profit-controller bucket-transition deadlock where telemetry reset could clear the no-spec baseline while preserving a positive active draft depth, causing all later cycles to run as unrecorded single-token baseline and leaving DFlash permanently disabled.
-- Added periodic profit-controller baseline reprobes with `--spec-dm-profit-baseline-interval` / `LLAMA_ARG_SPEC_DM_PROFIT_BASELINE_INTERVAL` so long-context runs can refresh target-only timing as context grows. The default interval is 512 active speculative cycles, and periodic reprobes start only in longer context buckets; bucket transitions still seed a fresh baseline. Off-state probes now restart with the configured probe depth instead of jumping straight to full draft depth.
+- Added low-frequency profit-controller baseline reprobes with `--spec-dm-profit-baseline-interval` / `LLAMA_ARG_SPEC_DM_PROFIT_BASELINE_INTERVAL` so runs can refresh target-only timing as context grows. The default interval is 1024 active speculative cycles to keep probe overhead minimal; bucket transitions still seed a fresh baseline. Off-state probes now restart with the configured probe depth instead of jumping straight to full draft depth.
 - Stabilized profit depth selection on the production ladder (`0`-`8`, `10`, `12`, `14`, `16`, and the configured max) while preserving the previous active depth across baseline reprobes and avoiding off-probe counter starvation from repeated baseline cycles.
 - Hardened active-reasoning EOS handling. When an end-of-generation token appears while reasoning output is still active, the sampler now forces the reasoning-end sequence through the normal full-logits path; reduced DFlash verification rejects that case instead of accepting an unsafe reduced candidate set.
 - Hardened DFlash on split CUDA / multi-GPU placement. GPU cross-ring setup, hidden capture, CUDA graph capture, K/V projection cache updates, recurrent replay, conv replay, and async tensor get/set paths now check buffer/backend ownership and fall back to safer CPU or owning-buffer paths instead of reading or writing recurrent state through the wrong CUDA backend.
diff --git a/common/arg.cpp b/common/arg.cpp
@@ -3843,7 +3843,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_spec().set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SPEC_DM_PROFIT_WARMUP"));
     add_opt(common_arg(
         {"--spec-dm-profit-baseline-interval"}, "N",
-        string_format("active profit-controller cycles between long-context no-spec baseline reprobes (default: %d, 0 = disabled)", params.speculative.dm_profit_baseline_interval),
+        string_format("active profit-controller cycles between no-spec baseline reprobes (default: %d, 0 = disabled)", params.speculative.dm_profit_baseline_interval),
         [](common_params & params, int value) {
             if (value < 0 || value > 4096) {
                 throw std::invalid_argument("spec-dm-profit-baseline-interval must be in [0, 4096]");
diff --git a/common/common.h b/common/common.h
@@ -402,7 +402,7 @@ struct common_params_speculative {
     float   dm_profit_ewma_alpha   = 0.15f;
     int32_t dm_profit_min_samples  = 3;
     int32_t dm_profit_warmup       = 0;     // positive-depth warmup cycles after baseline seeding (0 = auto from min_samples)
-    int32_t dm_profit_baseline_interval = 512; // active spec cycles between long-context no-spec baseline reprobes (0 = disabled)
+    int32_t dm_profit_baseline_interval = 1024; // active spec cycles between no-spec baseline reprobes (0 = disabled)
 
     // DFlash draft model (separate from upstream's draft.model)
     struct common_params_model mparams_dft;
diff --git a/docs/beellama-args.md b/docs/beellama-args.md
@@ -310,7 +310,7 @@ Adaptive Draft-Max is enabled by default for DFlash. It can reduce the active dr
 | `--spec-dm-profit-ewma-alpha F` | `0.15` | Smoothing factor for acceptance and timing running averages. |
 | `--spec-dm-profit-min-samples N` | `3` | Minimum observations per position/depth before scoring that depth as ready. |
 | `--spec-dm-profit-warmup N` | `0` | Positive-depth warmup cycles after the no-spec baseline is seeded (0 = use --spec-dm-profit-min-samples). |
-| `--spec-dm-profit-baseline-interval N` | `512` | Active speculative cycles between long-context no-spec baseline reprobes (0 = disabled). |
+| `--spec-dm-profit-baseline-interval N` | `1024` | Active speculative cycles between no-spec baseline reprobes (0 = disabled). |
 
 Use `profit` for normal serving. Use `fringe` when you want behavior tied more directly to observed draft acceptance near the active tail. Use `--no-spec-dm-adaptive` only when comparing fixed `--spec-draft-n-max` values or reproducing a narrow benchmark.
 
diff --git a/docs/beellama-features.md b/docs/beellama-features.md
@@ -128,7 +128,7 @@ This is not the same as public buun's checked DFlash adaptive tracking. Bee adds
 --spec-dm-profit-ewma-alpha 0.15
 --spec-dm-profit-min-samples 3
 --spec-dm-profit-warmup 0
---spec-dm-profit-baseline-interval 512
+--spec-dm-profit-baseline-interval 1024
 ```
 
 Use `--no-spec-dm-adaptive` when you need a fixed-depth benchmark. Otherwise, adaptive mode is the safer default for live serving because it can back away from weak drafts without changing the process command line.
diff --git a/tests/test-adaptive-dm.cpp b/tests/test-adaptive-dm.cpp
@@ -375,7 +375,7 @@ int main() {
         assert(reprobe.decide_profit_n_max(8) == 8);
     }
 
-    // test periodic baseline reprobes wait until longer context buckets
+    // test periodic baseline reprobes are interval-based, not context-bucket gated
     {
         server_adaptive_dm_state early;
         early.dm_profit_min_samples = 1;
@@ -392,7 +392,7 @@ int main() {
         early.observe_profit_timing(0, 0.0f, 40.0f, 0.0f, 40.0f);
         early.observe_profit_acceptance(8, 7);
         early.observe_profit_timing(8, 8.0f, 30.0f, 2.0f, 40.0f);
-        assert(!early.profit_should_probe_baseline());
+        assert(early.profit_should_probe_baseline());
     }
 
     return 0;
diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp
@@ -145,7 +145,7 @@ int main(void) {
     assert(params.speculative.dm_controller == COMMON_SPECULATIVE_DM_CONTROLLER_PROFIT);
     assert(params.speculative.dm_profit_min_samples == 3);
     assert(params.speculative.dm_profit_warmup == 0);
-    assert(params.speculative.dm_profit_baseline_interval == 512);
+    assert(params.speculative.dm_profit_baseline_interval == 1024);
 
     argv = {"binary_name", "--spec-draft-p-min", "0"};
     assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
diff --git a/tools/server/server-adaptive-dm.h b/tools/server/server-adaptive-dm.h
@@ -11,7 +11,6 @@
 static constexpr int SERVER_ADAPTIVE_DM_PROFIT_POSITIONS  = 128;
 static constexpr int SERVER_ADAPTIVE_DM_PROFIT_DEPTHS     = SERVER_ADAPTIVE_DM_PROFIT_POSITIONS + 1;
 static constexpr int SERVER_ADAPTIVE_DM_PROFIT_CANDIDATES = SERVER_ADAPTIVE_DM_PROFIT_DEPTHS + 1;
-static constexpr int SERVER_ADAPTIVE_DM_PROFIT_BASELINE_REPROBE_MIN_BUCKET = 3;
 
 static inline int server_adaptive_dm_probe_n_max(int base_n_max, float probe_fraction) {
     if (base_n_max <= 0) {
@@ -278,7 +277,7 @@ struct server_adaptive_dm_state {
     float   dm_profit_ewma_alpha   = 0.15f;
     int32_t dm_profit_min_samples  = 3;
     int32_t dm_profit_warmup       = 0;
-    int32_t dm_profit_baseline_interval = 512;
+    int32_t dm_profit_baseline_interval = 1024;
 
     struct profit_depth_stats {
         int32_t samples = 0;
@@ -427,7 +426,6 @@ struct server_adaptive_dm_state {
             profit_baseline_ready() &&
             !profit_baseline_probe_pending &&
             adaptive_n_max > 0 &&
-            profit_key.context_bucket >= SERVER_ADAPTIVE_DM_PROFIT_BASELINE_REPROBE_MIN_BUCKET &&
             profit_cycles_since_baseline >= dm_profit_baseline_interval;
     }
 

Original file line number	Diff line number	Diff line change
`@@ -375,7 +375,7 @@ int main() {`
`375`	`375`	`assert(reprobe.decide_profit_n_max(8) == 8);`
`376`	`376`	`}`
`377`	`377`
`378`		`- // test periodic baseline reprobes wait until longer context buckets`
	`378`	`+ // test periodic baseline reprobes are interval-based, not context-bucket gated`
`379`	`379`	`{`
`380`	`380`	`server_adaptive_dm_state early;`
`381`	`381`	`early.dm_profit_min_samples = 1;`
`@@ -392,7 +392,7 @@ int main() {`
`392`	`392`	`early.observe_profit_timing(0, 0.0f, 40.0f, 0.0f, 40.0f);`
`393`	`393`	`early.observe_profit_acceptance(8, 7);`
`394`	`394`	`early.observe_profit_timing(8, 8.0f, 30.0f, 2.0f, 40.0f);`
`395`		`- assert(!early.profit_should_probe_baseline());`
	`395`	`+ assert(early.profit_should_probe_baseline());`
`396`	`396`	`}`
`397`	`397`
`398`	`398`	`return 0;`