Phase 4: working-wait — flat gate-chained DWT scheduling

Grok Compression · Grok Compression · commit 3184de2dc93d · 2026-05-17T08:11:46.000-04:00
Restructure DWT submission to eliminate sequence barriers:
- All DWT batches (H and V, all resolutions) go into seq 0
- Ordering enforced purely by on_complete gates:
  T1 → hGate → H-DWT → (on_complete) → vGate → V-DWT → (on_complete) → hGate(next) → ...
- Workers naturally process T1 jobs while DWT gates are closed
- No sequence transition synchronization overhead between DWT steps
- hGate[0]: count = T1 blocks at res 0 + res 1; hGate[R>0]: count = T1 blocks at res R+1, plus 1 (previous V-DWT on_complete)
- vGate[R]: signaled once by H-DWT batch on_complete

Uses new scx-scheduling APIs: submit_full_batch, working_wait.
diff --git a/src/lib/core/scheduling/freebyrd/SchedulerFreebyrd.cpp b/src/lib/core/scheduling/freebyrd/SchedulerFreebyrd.cpp
@@ -557,14 +557,21 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
   }
 
   // === Phase 2: Create gates for DWT and assign to T1 blocks ===
-  // Gate for H-DWT at resolution R:
-  //   R==1: signaled by T1 blocks at res 0 AND res 1
-  //   R>1:  signaled by T1 blocks at res R
-  //   (V-DWT dependency on lower-res result handled by sequence ordering in DWT domain)
+  // Working-wait architecture: all DWT work in a single sequence (seq 0),
+  // ordering enforced purely by gates and on_complete callbacks.
+  //
+  // Gate chain per component:
+  //   T1 blocks at res 0+1 → hGate[0] → H-DWT res 1 → (on_complete) → vGate[0] → V-DWT res 1
+  //     → (on_complete) → hGate[1] → H-DWT res 2 → (on_complete) → vGate[1] → V-DWT res 2 → ...
+  //
+  // hGate[0]: count = T1 blocks at res 0 + res 1 (signaled by T1 callbacks)
+  // hGate[R>0]: count = T1 blocks at res R+1 + 1 (T1 callbacks + V-DWT prev on_complete)
+  // vGate[R]: count = 1 (signaled by H-DWT batch on_complete)
 
   struct CompGates
   {
     std::vector<uint32_t> hDwtGateIds; // hDwtGateIds[r-1] = gate for H-DWT at resolution r
+    std::vector<uint32_t> vDwtGateIds; // vDwtGateIds[r-1] = gate for V-DWT at resolution r
   };
   std::vector<CompGates> compGates(numcomps_);
 
@@ -587,14 +594,20 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
     // Create gates for each DWT resolution
     auto& gates = compGates[compno];
     gates.hDwtGateIds.resize(cw.numRes - 1);
+    gates.vDwtGateIds.resize(cw.numRes - 1);
+
     for(uint8_t r = 1; r < cw.numRes; ++r)
     {
-      size_t gateCount;
+      // H-DWT gate: signaled by T1 block completions + V-DWT(r-1) on_complete (for r>1)
+      size_t hGateCount;
       if(r == 1)
-        gateCount = blocksPerRes[0] + blocksPerRes[1]; // LL + all bands at res 1
+        hGateCount = blocksPerRes[0] + blocksPerRes[1]; // LL + all bands at res 1
       else
-        gateCount = blocksPerRes[r]; // bands at this res
-      gates.hDwtGateIds[r - 1] = scx_engine_create_gate(engine, gateCount);
+        hGateCount = blocksPerRes[r] + 1; // T1 blocks at res r, plus 1 signal from V-DWT(r-1)
+      gates.hDwtGateIds[r - 1] = scx_engine_create_gate(engine, hGateCount);
+
+      // V-DWT gate: signaled once when H-DWT batch at this res completes
+      gates.vDwtGateIds[r - 1] = scx_engine_create_gate(engine, 1);
     }
 
     // Assign gate IDs to T1 blocks
@@ -683,9 +696,11 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
                             &t1Contexts[compno]);
   }
 
-  // === Phase 4: Submit DWT work (DWT domain, gated sequences) ===
-  // DWT domain sequences: seq 0 = H-DWT res 1 (gated), seq 1 = V-DWT res 1,
-  //                       seq 2 = H-DWT res 2 (gated), seq 3 = V-DWT res 2, ...
+  // === Phase 4: Submit DWT work (DWT domain, all in seq 0, gate-chained) ===
+  // Working-wait model: all DWT batches in a single sequence.
+  // Ordering enforced by gates + on_complete callbacks:
+  //   H-DWT(r): gated on hGate[r-1], on_complete signals vGate[r-1]
+  //   V-DWT(r): gated on vGate[r-1], on_complete signals hGate[r] (H-DWT gate for res r+1; none at last res)
 
   struct Dwt53HCtx
   {
@@ -776,8 +791,6 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
     wav->horizPool_ = std::make_unique<dwt_scratch<int32_t>[]>(num_threads);
     wav->vertPool_ = std::make_unique<dwt_scratch<int32_t>[]>(num_threads);
 
-    size_t dwtSeqIdx = 0;
-
     for(uint8_t res = 1; res < cw.numRes; ++res)
     {
       wav->horiz_.sn = bandLL->width();
@@ -809,8 +822,13 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
         wav->vertPool_[i].mem = (int32_t*)waveletPoolData_->getVert(i);
       }
 
-      // H-DWT stripes (GATED on prerequisite T1 blocks)
+      // H-DWT stripes: gated on hGate, on_complete signals vGate
       uint32_t hDwtGate = gates.hDwtGateIds[res - 1];
+      uint32_t vDwtGate = gates.vDwtGateIds[res - 1];
+      // V-DWT on_complete signals next H-DWT gate (if not last resolution)
+      uint32_t vOnComplete =
+          (res < cw.numRes - 1) ? gates.hDwtGateIds[res] : SCX_NO_GATE;
+
       {
         Dwt53HCtx hctx;
         hctx.wavelet = wav;
@@ -849,13 +867,13 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
         {
           ds.h53.push_back(std::move(hctx));
           auto& ctx = ds.h53.back();
-          scx_engine_submit_gated_batch(engine, cw.dwtDomainId, dwtSeqIdx, ctx.stripes.size(),
-                                        Dwt53HCtx::execute, &ctx, hDwtGate);
-          dwtSeqIdx++;
+          // H-DWT: gated on T1 completion (+ V-DWT prev), on_complete signals V-DWT gate
+          scx_engine_submit_full_batch(engine, cw.dwtDomainId, 0, ctx.stripes.size(),
+                                       Dwt53HCtx::execute, &ctx, hDwtGate, vDwtGate);
         }
       }
 
-      // V-DWT stripes (sequence ordering ensures H-DWT done first)
+      // V-DWT stripes: gated on V-DWT gate (signaled by H-DWT on_complete)
       {
         DcShiftParam dcShift = (res == cw.numRes - 1) ? wav->dcShift_ : DcShiftParam{};
         auto winL = tileBuffer->getResWindowBufferSplitSimple(res, SPLIT_L);
@@ -880,15 +898,17 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
         {
           ds.v53.push_back(std::move(vctx));
           auto& ctx = ds.v53.back();
-          scx_engine_submit_batch(engine, cw.dwtDomainId, dwtSeqIdx, ctx.stripes.size(),
-                                  Dwt53VCtx::execute, &ctx);
-          dwtSeqIdx++;
+          // V-DWT: gated on H-DWT completion, on_complete signals next H-DWT gate
+          scx_engine_submit_full_batch(engine, cw.dwtDomainId, 0, ctx.stripes.size(),
+                                       Dwt53VCtx::execute, &ctx, vDwtGate, vOnComplete);
         }
       }
     }
   }
 
   // === Phase 5: Run engine (T1 + gated DWT interleave naturally) ===
+  // Working-wait: workers process T1 jobs while DWT gates are closed,
+  // then immediately start DWT work as gates open — no sequence barriers.
   scx_engine_run(engine);
 
   if(!success_)