@@ -557,14 +557,21 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
557557 }
558558
559559 // === Phase 2: Create gates for DWT and assign to T1 blocks ===
560- // Gate for H-DWT at resolution R:
561- // R==1: signaled by T1 blocks at res 0 AND res 1
562- // R>1: signaled by T1 blocks at res R
563- // (V-DWT dependency on lower-res result handled by sequence ordering in DWT domain)
560+ // Working-wait architecture: all DWT work in a single sequence (seq 0),
561+ // ordering enforced purely by gates and on_complete callbacks.
562+ //
563+ // Gate chain per component:
564+ // T1 blocks at res 0+1 → hGate[0] → H-DWT res 1 → (on_complete) → vGate[0] → V-DWT res 1
565+ // → (on_complete) → hGate[1] → H-DWT res 2 → (on_complete) → vGate[1] → V-DWT res 2 → ...
566+ //
567+ // hGate[0]: count = T1 blocks at res 0 + res 1 (signaled by T1 callbacks)
568+ // hGate[i], i>0: count = (T1 blocks at res i+1) + 1 (T1 callbacks + one signal from the previous V-DWT's on_complete; gate index i serves resolution i+1)
569+ // vGate[R]: count = 1 (signaled by H-DWT batch on_complete)
564570
565571 struct CompGates // Gate IDs for one component's DWT stages; one entry per resolution r in [1, numRes-1], stored at index r-1
566572 {
567573 std::vector<uint32_t > hDwtGateIds; // hDwtGateIds[r-1] = gate the H-DWT batch at resolution r is submitted against (ID from scx_engine_create_gate)
574+ std::vector<uint32_t > vDwtGateIds; // vDwtGateIds[r-1] = gate the V-DWT batch at resolution r is submitted against; passed as the on_complete gate of the H-DWT batch at r
568575 };
569576 std::vector<CompGates> compGates (numcomps_);
570577
@@ -587,14 +594,20 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
587594 // Create gates for each DWT resolution
588595 auto & gates = compGates[compno];
589596 gates.hDwtGateIds .resize (cw.numRes - 1 );
597+ gates.vDwtGateIds .resize (cw.numRes - 1 );
598+
590599 for (uint8_t r = 1 ; r < cw.numRes ; ++r)
591600 {
592- size_t gateCount;
601+ // H-DWT gate: signaled by T1 block completions + V-DWT(r-1) on_complete (for r>1)
602+ size_t hGateCount;
593603 if (r == 1 )
594- gateCount = blocksPerRes[0 ] + blocksPerRes[1 ]; // LL + all bands at res 1
604+ hGateCount = blocksPerRes[0 ] + blocksPerRes[1 ]; // LL + all bands at res 1
595605 else
596- gateCount = blocksPerRes[r]; // bands at this res
597- gates.hDwtGateIds [r - 1 ] = scx_engine_create_gate (engine, gateCount);
606+ hGateCount = blocksPerRes[r] + 1 ; // T1 blocks at res r, plus 1 signal from V-DWT(r-1)
607+ gates.hDwtGateIds [r - 1 ] = scx_engine_create_gate (engine, hGateCount);
608+
609+ // V-DWT gate: signaled once when H-DWT batch at this res completes
610+ gates.vDwtGateIds [r - 1 ] = scx_engine_create_gate (engine, 1 );
598611 }
599612
600613 // Assign gate IDs to T1 blocks
@@ -683,9 +696,11 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
683696 &t1Contexts[compno]);
684697 }
685698
686- // === Phase 4: Submit DWT work (DWT domain, gated sequences) ===
687- // DWT domain sequences: seq 0 = H-DWT res 1 (gated), seq 1 = V-DWT res 1,
688- // seq 2 = H-DWT res 2 (gated), seq 3 = V-DWT res 2, ...
699+ // === Phase 4: Submit DWT work (DWT domain, all in seq 0, gate-chained) ===
700+ // Working-wait model: all DWT batches in a single sequence.
701+ // Ordering enforced by gates + on_complete callbacks:
702+ // H-DWT(r): gated on hGate[r-1], on_complete signals vGate[r-1]
703+ // V-DWT(r): gated on vGate[r-1], on_complete signals hGate[r] (for r+1)
689704
690705 struct Dwt53HCtx
691706 {
@@ -776,8 +791,6 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
776791 wav->horizPool_ = std::make_unique<dwt_scratch<int32_t >[]>(num_threads);
777792 wav->vertPool_ = std::make_unique<dwt_scratch<int32_t >[]>(num_threads);
778793
779- size_t dwtSeqIdx = 0 ;
780-
781794 for (uint8_t res = 1 ; res < cw.numRes ; ++res)
782795 {
783796 wav->horiz_ .sn = bandLL->width ();
@@ -809,8 +822,13 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
809822 wav->vertPool_ [i].mem = (int32_t *)waveletPoolData_->getVert (i);
810823 }
811824
812- // H-DWT stripes (GATED on prerequisite T1 blocks)
825+ // H-DWT stripes: gated on hGate, on_complete signals vGate
813826 uint32_t hDwtGate = gates.hDwtGateIds [res - 1 ];
827+ uint32_t vDwtGate = gates.vDwtGateIds [res - 1 ];
828+ // V-DWT on_complete signals next H-DWT gate (if not last resolution)
829+ uint32_t vOnComplete =
830+ (res < cw.numRes - 1 ) ? gates.hDwtGateIds [res] : SCX_NO_GATE;
831+
814832 {
815833 Dwt53HCtx hctx;
816834 hctx.wavelet = wav;
@@ -849,13 +867,13 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
849867 {
850868 ds.h53 .push_back (std::move (hctx));
851869 auto & ctx = ds.h53 .back ();
852- scx_engine_submit_gated_batch (engine, cw. dwtDomainId , dwtSeqIdx, ctx. stripes . size (),
853- Dwt53HCtx::execute, &ctx, hDwtGate);
854- dwtSeqIdx++ ;
870+ // H-DWT: gated on T1 completion (+ V-DWT prev), on_complete signals V-DWT gate
871+ scx_engine_submit_full_batch (engine, cw. dwtDomainId , 0 , ctx. stripes . size (),
872+ Dwt53HCtx::execute, &ctx, hDwtGate, vDwtGate) ;
855873 }
856874 }
857875
858- // V-DWT stripes (sequence ordering ensures H-DWT done first )
876+ // V-DWT stripes: gated on V-DWT gate (signaled by H-DWT on_complete )
859877 {
860878 DcShiftParam dcShift = (res == cw.numRes - 1 ) ? wav->dcShift_ : DcShiftParam{};
861879 auto winL = tileBuffer->getResWindowBufferSplitSimple (res, SPLIT_L);
@@ -880,15 +898,17 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
880898 {
881899 ds.v53 .push_back (std::move (vctx));
882900 auto & ctx = ds.v53 .back ();
883- scx_engine_submit_batch (engine, cw. dwtDomainId , dwtSeqIdx, ctx. stripes . size (),
884- Dwt53VCtx::execute, & ctx);
885- dwtSeqIdx++ ;
901+ // V-DWT: gated on H-DWT completion, on_complete signals next H-DWT gate
902+ scx_engine_submit_full_batch (engine, cw. dwtDomainId , 0 , ctx. stripes . size (),
903+ Dwt53VCtx::execute, &ctx, vDwtGate, vOnComplete) ;
886904 }
887905 }
888906 }
889907 }
890908
891909 // === Phase 5: Run engine (T1 + gated DWT interleave naturally) ===
910+ // Working-wait: workers process T1 jobs while DWT gates are closed,
911+ // then immediately start DWT work as gates open — no sequence barriers.
892912 scx_engine_run (engine);
893913
894914 if (!success_)
0 commit comments