Skip to content

Commit 3184de2

Browse files
author
Grok Compression
committed
Phase 4: working-wait — flat gate-chained DWT scheduling
Restructure DWT submission to eliminate sequence barriers:
- All DWT batches (H and V, all resolutions) go into seq 0
- Ordering is enforced purely by on_complete gates: T1 → hGate → H-DWT → (on_complete) → vGate → V-DWT → (on_complete) → hGate(next) → ...
- Workers naturally process T1 jobs while DWT gates are closed
- No sequence-transition synchronization overhead between DWT steps
- hGate[0]: T1 blocks at res 0+1; hGate[R>0]: T1(R+1) + 1 (V-DWT prev)
- vGate[R]: signaled once by H-DWT batch on_complete

Uses new scx-scheduling APIs: submit_full_batch, working_wait.
1 parent c1b76c0 commit 3184de2

1 file changed

Lines changed: 41 additions & 21 deletions

File tree

src/lib/core/scheduling/freebyrd/SchedulerFreebyrd.cpp

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -557,14 +557,21 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
557557
}
558558

559559
// === Phase 2: Create gates for DWT and assign to T1 blocks ===
560-
// Gate for H-DWT at resolution R:
561-
// R==1: signaled by T1 blocks at res 0 AND res 1
562-
// R>1: signaled by T1 blocks at res R
563-
// (V-DWT dependency on lower-res result handled by sequence ordering in DWT domain)
560+
// Working-wait architecture: all DWT work in a single sequence (seq 0),
561+
// ordering enforced purely by gates and on_complete callbacks.
562+
//
563+
// Gate chain per component:
564+
// T1 blocks at res 0+1 → hGate[0] → H-DWT res 1 → (on_complete) → vGate[0] → V-DWT res 1
565+
// → (on_complete) → hGate[1] → H-DWT res 2 → (on_complete) → vGate[1] → V-DWT res 2 → ...
566+
//
567+
// hGate[0]: count = T1 blocks at res 0 + res 1 (signaled by T1 callbacks)
568+
// hGate[R>0]: count = (T1 blocks at res R+1) + 1 (T1 callbacks, plus one signal from the on_complete of V-DWT at res R)
569+
// vGate[R]: count = 1 (signaled by H-DWT batch on_complete)
564570

565571
struct CompGates
566572
{
567573
std::vector<uint32_t> hDwtGateIds; // hDwtGateIds[r-1] = gate for H-DWT at resolution r
574+
std::vector<uint32_t> vDwtGateIds; // vDwtGateIds[r-1] = gate for V-DWT at resolution r
568575
};
569576
std::vector<CompGates> compGates(numcomps_);
570577

@@ -587,14 +594,20 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
587594
// Create gates for each DWT resolution
588595
auto& gates = compGates[compno];
589596
gates.hDwtGateIds.resize(cw.numRes - 1);
597+
gates.vDwtGateIds.resize(cw.numRes - 1);
598+
590599
for(uint8_t r = 1; r < cw.numRes; ++r)
591600
{
592-
size_t gateCount;
601+
// H-DWT gate: signaled by T1 block completions + V-DWT(r-1) on_complete (for r>1)
602+
size_t hGateCount;
593603
if(r == 1)
594-
gateCount = blocksPerRes[0] + blocksPerRes[1]; // LL + all bands at res 1
604+
hGateCount = blocksPerRes[0] + blocksPerRes[1]; // LL + all bands at res 1
595605
else
596-
gateCount = blocksPerRes[r]; // bands at this res
597-
gates.hDwtGateIds[r - 1] = scx_engine_create_gate(engine, gateCount);
606+
hGateCount = blocksPerRes[r] + 1; // T1 blocks at res R + 1 signal from V-DWT(r-1)
607+
gates.hDwtGateIds[r - 1] = scx_engine_create_gate(engine, hGateCount);
608+
609+
// V-DWT gate: signaled once when H-DWT batch at this res completes
610+
gates.vDwtGateIds[r - 1] = scx_engine_create_gate(engine, 1);
598611
}
599612

600613
// Assign gate IDs to T1 blocks
@@ -683,9 +696,11 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
683696
&t1Contexts[compno]);
684697
}
685698

686-
// === Phase 4: Submit DWT work (DWT domain, gated sequences) ===
687-
// DWT domain sequences: seq 0 = H-DWT res 1 (gated), seq 1 = V-DWT res 1,
688-
// seq 2 = H-DWT res 2 (gated), seq 3 = V-DWT res 2, ...
699+
// === Phase 4: Submit DWT work (DWT domain, all in seq 0, gate-chained) ===
700+
// Working-wait model: all DWT batches in a single sequence.
701+
// Ordering enforced by gates + on_complete callbacks:
702+
// H-DWT(r): gated on hGate[r-1], on_complete signals vGate[r-1]
703+
// V-DWT(r): gated on vGate[r-1], on_complete signals hGate[r], i.e. the H-DWT gate for res r+1 (no gate is signaled after the last resolution)
689704

690705
struct Dwt53HCtx
691706
{
@@ -776,8 +791,6 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
776791
wav->horizPool_ = std::make_unique<dwt_scratch<int32_t>[]>(num_threads);
777792
wav->vertPool_ = std::make_unique<dwt_scratch<int32_t>[]>(num_threads);
778793

779-
size_t dwtSeqIdx = 0;
780-
781794
for(uint8_t res = 1; res < cw.numRes; ++res)
782795
{
783796
wav->horiz_.sn = bandLL->width();
@@ -809,8 +822,13 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
809822
wav->vertPool_[i].mem = (int32_t*)waveletPoolData_->getVert(i);
810823
}
811824

812-
// H-DWT stripes (GATED on prerequisite T1 blocks)
825+
// H-DWT stripes: gated on hGate, on_complete signals vGate
813826
uint32_t hDwtGate = gates.hDwtGateIds[res - 1];
827+
uint32_t vDwtGate = gates.vDwtGateIds[res - 1];
828+
// V-DWT on_complete signals next H-DWT gate (if not last resolution)
829+
uint32_t vOnComplete =
830+
(res < cw.numRes - 1) ? gates.hDwtGateIds[res] : SCX_NO_GATE;
831+
814832
{
815833
Dwt53HCtx hctx;
816834
hctx.wavelet = wav;
@@ -849,13 +867,13 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
849867
{
850868
ds.h53.push_back(std::move(hctx));
851869
auto& ctx = ds.h53.back();
852-
scx_engine_submit_gated_batch(engine, cw.dwtDomainId, dwtSeqIdx, ctx.stripes.size(),
853-
Dwt53HCtx::execute, &ctx, hDwtGate);
854-
dwtSeqIdx++;
870+
// H-DWT: gated on T1 completion (+ V-DWT prev), on_complete signals V-DWT gate
871+
scx_engine_submit_full_batch(engine, cw.dwtDomainId, 0, ctx.stripes.size(),
872+
Dwt53HCtx::execute, &ctx, hDwtGate, vDwtGate);
855873
}
856874
}
857875

858-
// V-DWT stripes (sequence ordering ensures H-DWT done first)
876+
// V-DWT stripes: gated on V-DWT gate (signaled by H-DWT on_complete)
859877
{
860878
DcShiftParam dcShift = (res == cw.numRes - 1) ? wav->dcShift_ : DcShiftParam{};
861879
auto winL = tileBuffer->getResWindowBufferSplitSimple(res, SPLIT_L);
@@ -880,15 +898,17 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
880898
{
881899
ds.v53.push_back(std::move(vctx));
882900
auto& ctx = ds.v53.back();
883-
scx_engine_submit_batch(engine, cw.dwtDomainId, dwtSeqIdx, ctx.stripes.size(),
884-
Dwt53VCtx::execute, &ctx);
885-
dwtSeqIdx++;
901+
// V-DWT: gated on H-DWT completion, on_complete signals next H-DWT gate
902+
scx_engine_submit_full_batch(engine, cw.dwtDomainId, 0, ctx.stripes.size(),
903+
Dwt53VCtx::execute, &ctx, vDwtGate, vOnComplete);
886904
}
887905
}
888906
}
889907
}
890908

891909
// === Phase 5: Run engine (T1 + gated DWT interleave naturally) ===
910+
// Working-wait: workers process T1 jobs while DWT gates are closed,
911+
// then immediately start DWT work as gates open — no sequence barriers.
892912
scx_engine_run(engine);
893913

894914
if(!success_)

0 commit comments

Comments
 (0)