Skip to content

Commit 57da314

Browse files
committed
Bootstrap DFlash profit adaptive DM at max depth, converge via argmax
The profit controller's cold start walked the active draft depth up through low probe depths (0 -> 4 -> 8 -> max) before settling, so short responses spent their useful window at low depth and felt laggy. A prior attempt to "start high" instead reversed the walk to descend (max -> 8 -> 4) and let production rest at the walk's terminal depth; since cold-start probe measurements are uniform, that collapsed to the floor and could not climb back, halving decode throughput on high-acceptance workloads.

Decouple the resting depth from the probe walk:

- Cold start now holds production at the maximum draft depth once the no-spec baseline exists, instead of walking through low depths, so short requests run at max immediately. If the held max is measured clearly worse than no-spec, it falls through early so a better depth can take over.
- The scheduler characterizes the lower probe spread through transient one-cycle excursions while production stays at max; the argmax candidate scorer then demotes only when a measured lower depth is genuinely faster (the safe, well-gated direction).

The existing scoring, hysteresis, active-episode, baseline-reprobe, off-probe, and lower-rescue safeguards are unchanged, so the controller still converges to the true throughput optimum whether it is high, mid, or low.

Add end-to-end convergence tests (high/mid/low optima) asserting the settled depth -- the property the earlier regression violated -- and update the cold-start/warmup tests to the hold-max mechanism.
1 parent 4caa0a4 commit 57da314

3 files changed

Lines changed: 108 additions & 34 deletions

File tree

tests/test-adaptive-dm.cpp

Lines changed: 74 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -422,10 +422,13 @@ int main() {
422422
state.apply_profit_recommendation(recommended);
423423
}
424424
state.observe_profit_timing(0, 0, 0, 0.0f, 30.0f, 0.0f, 30.0f);
425-
assert(state.decide_profit_n_max(8) == 4);
425+
// With a baseline available the cold controller bootstraps at the maximum
426+
// draft depth rather than walking up from a shallow probe.
427+
assert(state.decide_profit_n_max(8) == 8);
426428

427-
// test cold start: fresh profit controllers seed no-spec baseline before
428-
// any positive-depth DFlash cycle, then probe shallow depth first.
429+
// test cold start: fresh profit controllers seed the no-spec baseline before
430+
// any positive-depth DFlash cycle, then bootstrap production at the maximum
431+
// draft depth (the lower spread is characterized via transient excursions).
429432
state.reset_profit_state();
430433
state.dm_profit_min_samples = 2;
431434
state.dm_off_dwell = 1;
@@ -435,7 +438,7 @@ int main() {
435438
assert(state.decide_profit_n_max(8) == 0);
436439
state.observe_profit_timing(0, 0, 0, 0.0f, 32.0f, 0.0f, 32.0f);
437440
assert(state.profit_baseline_ready());
438-
assert(state.decide_profit_n_max(8) == 2);
441+
assert(state.decide_profit_n_max(8) == 8);
439442

440443
// Baseline scoring uses the same current EWMA policy as positive depths;
441444
// stale best no-spec spikes must not make baseline unbeatable.
@@ -447,17 +450,79 @@ int main() {
447450
assert(state.profit_baseline.best_score >= 39.9f);
448451
assert_close(state.profit_score_for_depth(0), 16.0f);
449452

450-
// test explicit warmup requires extra measured samples for the initial
451-
// positive-depth probe before moving to the next depth.
453+
// test explicit warmup requires extra measured samples per spread depth before
454+
// the initial probe set is treated as characterized; production holds at the
455+
// maximum draft depth throughout, and only argmax (post-characterization) moves.
452456
state.reset_profit_state();
453457
state.dm_profit_min_samples = 1;
454458
state.dm_profit_warmup = 2;
455459
state.observe_profit_timing(0, 0, 0, 0.0f, 30.0f, 0.0f, 30.0f);
456-
assert(state.decide_profit_n_max(8) == 2);
460+
assert(state.profit_baseline_ready());
461+
assert(state.decide_profit_n_max(8) == 8);
462+
// one sample per spread depth is not enough under warmup=2
457463
observe_profit_cycle(state, 2, 2, 1, 60.0f);
458-
assert(state.decide_profit_n_max(8) == 2);
464+
observe_profit_cycle(state, 4, 4, 1, 60.0f);
465+
observe_profit_cycle(state, 8, 8, 1, 60.0f);
466+
assert(!state.profit_initial_probe_set_ready(8));
467+
assert(state.decide_profit_n_max(8) == 8);
468+
// a second sample per spread depth completes characterization
459469
observe_profit_cycle(state, 2, 2, 1, 60.0f);
460-
assert(state.decide_profit_n_max(8) == 4);
470+
observe_profit_cycle(state, 4, 4, 1, 60.0f);
471+
observe_profit_cycle(state, 8, 8, 1, 60.0f);
472+
assert(state.profit_initial_probe_set_ready(8));
473+
state.dm_profit_warmup = 0;
474+
475+
// End-to-end convergence: a cold profit controller bootstraps at the maximum
476+
// draft depth and must then settle on the genuinely fastest depth, regardless
477+
// of whether the optimum is high, mid, or low. This is the property the earlier
478+
// "start high, walk down, rest at the walk terminal" regression violated: it
479+
// collapsed to the floor and could not climb back. Throughput here is modeled
480+
// unimodally (peak at `optimum`): score(d) = (1 + min(d, optimum)) / (20 + 2d),
481+
// with a no-spec baseline every positive depth beats.
482+
auto converged_depth = [](int base_n_max, int optimum) -> int {
483+
server_adaptive_dm_state s;
484+
s.dm_profit_min_samples = 2;
485+
s.dm_profit_baseline_interval = 0; // disable reprobe noise for a deterministic check
486+
auto feed = [&](int d) {
487+
if (d <= 0) {
488+
s.observe_profit_timing(0, 0, 0, 0.0f, 35.0f, 0.0f, 35.0f);
489+
return;
490+
}
491+
const int acc = d < optimum ? d : optimum;
492+
const float ms = 20.0f + 2.0f * (float) d;
493+
s.observe_profit_acceptance(d, acc);
494+
s.observe_profit_timing(d, d, acc, 0.0f, ms, 0.0f, ms);
495+
};
496+
feed(0);
497+
feed(0);
498+
assert(s.profit_baseline_ready());
499+
int rec = s.decide_profit_n_max(base_n_max);
500+
assert(rec == base_n_max); // cold start bootstraps at max, not a low probe
501+
s.apply_profit_recommendation(rec);
502+
// Each iteration samples production plus the full depth range, standing in
503+
// for transient characterization/exploration excursions accumulating over
504+
// time. The controller must converge to (and hold) the throughput optimum.
505+
for (int iter = 0; iter < 200; ++iter) {
506+
feed(s.adaptive_n_max > 0 ? s.adaptive_n_max : base_n_max);
507+
for (int d = 1; d <= base_n_max; ++d) {
508+
feed(d);
509+
}
510+
feed(0);
511+
rec = s.decide_profit_n_max(base_n_max);
512+
s.apply_profit_recommendation(rec);
513+
}
514+
return s.adaptive_n_max;
515+
};
516+
{
517+
const int high = converged_depth(15, 15);
518+
assert(high >= 12); // high optimum: stay high (the regressed case wanted this)
519+
const int mid8 = converged_depth(15, 8);
520+
assert(mid8 >= 6 && mid8 <= 10); // mid optimum identified, not stuck high or low
521+
const int mid12 = converged_depth(15, 12);
522+
assert(mid12 >= 10 && mid12 <= 14); // mid-high optimum identified
523+
const int low = converged_depth(15, 4);
524+
assert(low >= 2 && low <= 6); // low optimum identified, not stuck high
525+
}
461526

462527
// test reset_request_state preserves learned profit data while resetting
463528
// request-local counters

tools/server/server-adaptive-dm.h

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1298,29 +1298,30 @@ struct server_adaptive_dm_state {
12981298
? base_n_max
12991299
: std::clamp<int>(adaptive_n_max, 0, base_n_max));
13001300
const bool returning_from_baseline_probe = profit_baseline_probe_resume_n > 0;
1301-
const int unready_probe = profit_next_unready_probe_depth(base_n_max);
13021301
const bool collecting_initial_probe_set =
13031302
profit_baseline_probe_resume_n <= 0 &&
13041303
!profit_request_requires_fresh_switch_sample &&
13051304
!profit_initial_probe_set_ready(base_n_max);
1306-
const bool current_episode_ready_for_probe =
1307-
current_n > 0 &&
1308-
profit_active_episode_ready(current_n, base_n_max);
1309-
const float current_episode_score_for_probe =
1310-
current_episode_ready_for_probe ? profit_active_episode_score(current_n) : 0.0f;
1311-
const bool current_episode_clearly_bad =
1312-
current_episode_ready_for_probe &&
1313-
current_episode_score_for_probe < baseline_score * (1.0f + dm_profit_min);
1314-
const bool unready_probe_can_reduce_current =
1315-
current_n > 0 &&
1316-
unready_probe > 0 &&
1317-
unready_probe < current_n;
1318-
if (collecting_initial_probe_set &&
1319-
unready_probe > 0 &&
1320-
(!current_episode_clearly_bad || unready_probe_can_reduce_current)) {
1321-
profit_current_score = baseline_score;
1322-
profit_last_recommended_n = unready_probe;
1323-
return unready_probe;
1305+
// Cold-start bootstrap: once the no-spec baseline exists, run production at
1306+
// the maximum draft depth and let the scheduler characterize the lower probe
1307+
// spread through transient exploration excursions (see get_dflash_n_draft_max).
1308+
// Holding max keeps short requests fast instead of stranding them at a low
1309+
// probe depth, and the argmax candidate scorer below demotes only when a
1310+
// measured lower depth is genuinely faster. If the held positive depth is
1311+
// measured to be clearly worse than no-spec, fall through so a better
1312+
// characterized depth (or baseline) can take over without waiting for the
1313+
// full spread.
1314+
if (collecting_initial_probe_set) {
1315+
const int hold_n = current_n > 0 ? current_n : base_n_max;
1316+
const bool hold_episode_clearly_bad =
1317+
current_n > 0 &&
1318+
profit_active_episode_ready(current_n, base_n_max) &&
1319+
profit_active_episode_score(current_n) < baseline_score * (1.0f + dm_profit_min);
1320+
if (!hold_episode_clearly_bad) {
1321+
profit_current_score = baseline_score;
1322+
profit_last_recommended_n = hold_n;
1323+
return hold_n;
1324+
}
13241325
}
13251326

13261327
if (current_n == 0 && profit_baseline_probe_resume_n <= 0) {

tools/server/server-context.cpp

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1264,12 +1264,20 @@ struct server_slot : server_adaptive_dm_state {
12641264
if (advance_adaptive_probe) {
12651265
explore_counter++;
12661266
if (explore_counter % dm_explore_interval == 0) {
1267-
const int explore_n_max = profit_next_unready_explore_depth(
1268-
adaptive_n_max,
1269-
base_n_max,
1270-
explore_counter / dm_explore_interval);
1271-
if (explore_n_max > 0) {
1272-
n_draft_max = explore_n_max;
1267+
// While the cold-start probe spread is still uncharacterized,
1268+
// sample it through transient excursions so the profit argmax
1269+
// can compare low/mid/high depths; production stays at the held
1270+
// max between excursions. Afterwards, use steady local explore.
1271+
const int excursion_n_max =
1272+
(profit_baseline_ready() &&
1273+
!profit_initial_probe_set_ready(base_n_max))
1274+
? profit_next_unready_probe_depth(base_n_max)
1275+
: profit_next_unready_explore_depth(
1276+
adaptive_n_max,
1277+
base_n_max,
1278+
explore_counter / dm_explore_interval);
1279+
if (excursion_n_max > 0) {
1280+
n_draft_max = excursion_n_max;
12731281
}
12741282
}
12751283
}

0 commit comments

Comments
 (0)