Implement interleaved T1+DWT in freebyrd scheduler (Phase 4)

Grok Compression · Grok Compression · commit 5c83e2ecc309 · 2026-05-16T23:16:55.000-04:00
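The non-SCX path now builds a per-component DAG that interleaves T1 decoding
and DWT per resolution level, instead of decoding all code blocks first and
then running every DWT pass. Resolutions 0 and 1 share one flow slot, and each
component's flow is run to completion before the next component's is built.
TaskFlow manages the dependencies: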
- T1 blocks at res N complete before DWT at res N starts
- DWT at res N completes before T1 at res N+1 starts

This matches the sicorax domain-sequence approach: work is organized
by resolution level so that DWT can begin as soon as its input data
(decoded subbands + previous level output) is available.

The SCX path (Phase 3, compiled when GRK_USE_SCX_SCHEDULING is defined) remains
unchanged: sequential T1 → DWT.

Test results: 437/437 decode tests pass (zero regressions).
diff --git a/src/lib/core/scheduling/freebyrd/SchedulerFreebyrd.cpp b/src/lib/core/scheduling/freebyrd/SchedulerFreebyrd.cpp
@@ -89,7 +89,6 @@ class DwtFlowHelper : public SchedulerStandard
     imageComponentFlow_[compno] = new ImageComponentFlow(numRes);
     if(regionDecompress)
       imageComponentFlow_[compno]->setRegionDecompression();
-    // addTo must be called before graph to initialize composition tasks
     imageComponentFlow_[compno]->addTo(*this);
     SchedulerStandard::graph(compno);
   }
@@ -116,11 +115,17 @@ bool SchedulerFreebyrd::decompressTile(ITileProcessor* tileProcessor)
 {
   success_ = true;
 
+#ifdef GRK_USE_SCX_SCHEDULING
+  // SCX path: sequential T1 → DWT (Phase 3)
   if(!decodeBlocks(tileProcessor))
     return false;
-
   if(!runDWT(tileProcessor))
     return false;
+#else
+  // Interleaved T1+DWT: per-component DAG with T1 and DWT tasks (Phase 4)
+  if(!decodeAndTransform(tileProcessor))
+    return false;
+#endif
 
   if(!postProcess(tileProcessor))
     return false;
@@ -377,6 +382,190 @@ bool SchedulerFreebyrd::runDWT(ITileProcessor* tileProcessor)
   return true;
 }
 
+#ifndef GRK_USE_SCX_SCHEDULING
+// Decodes code blocks (T1) and runs the inverse DWT for each component that
+// should be decoded, building and running one flow per component (non-SCX builds).
+//
+// Per component: block-decode tasks are added to the resolution flows held by
+// dwtHelper_, WaveletReverse::decompress() is called with the same helper when
+// more than one resolution was read, and the helper is then run to completion
+// before the next component is handled.
+//
+// Returns false if WaveletReverse::decompress() fails or if success_ has been
+// cleared (e.g. by a failing block task); otherwise returns success_.
+bool SchedulerFreebyrd::decodeAndTransform(ITileProcessor* tileProcessor)
+{
+  auto tcp = tileProcessor->getTCP();
+  bool cacheAll =
+      (tileProcessor->getTileCacheStrategy() & GRK_TILE_CACHE_ALL) == GRK_TILE_CACHE_ALL;
+  uint32_t num_threads = (uint32_t)TFSingleton::num_threads();
+  bool finalLayer = tcp->layersToDecompress_ == tcp->numLayers_;
+
+  for(uint16_t compno = 0; compno < numcomps_; ++compno)
+  {
+    if(!tileProcessor->shouldDecodeComponent(compno))
+      continue;
+
+    auto tccp = tcp->tccps_ + compno;
+    uint16_t cbw = tccp->cblkw_expn_ ? (uint16_t)(1 << tccp->cblkw_expn_) : 0U;
+    uint16_t cbh = tccp->cblkh_expn_ ? (uint16_t)(1 << tccp->cblkh_expn_) : 0U;
+    // Use the stream pool when it exists and contains this code block size,
+    // otherwise the scheduler's own coder pool.
+    auto activePool = &coderPool_;
+    if(streamPool_ && streamPool_->contains(tccp->cblkw_expn_, tccp->cblkh_expn_))
+      activePool = streamPool_;
+
+    // Without full tile caching, block tasks fetch a pooled coder by worker id
+    // (see the task body below), so the pool is populated here.
+    if(!cacheAll)
+    {
+      activePool->makeCoders(
+          num_threads, tccp->cblkw_expn_, tccp->cblkh_expn_,
+          [tcp, cbw, cbh, tileProcessor]() -> std::shared_ptr<t1::ICoder> {
+            return std::shared_ptr<t1::ICoder>(t1::CoderFactory::makeCoder(
+                tcp->isHT(), false, cbw, cbh, tileProcessor->getTileCacheStrategy()));
+          });
+    }
+
+    auto tilec = tileProcessor->getTile()->comps_ + compno;
+    auto wholeTileDecoding = tilec->isWholeTileDecoding();
+    // With full caching, resume after the resolutions recorded in the current state.
+    uint8_t resBegin =
+        cacheAll ? (uint8_t)tilec->currentPacketProgressionState_.numResolutionsRead() : 0;
+    // Resolutions read so far; also the exclusive upper bound of the loop below.
+    uint8_t numRes = tilec->nextPacketProgressionState_.numResolutionsRead();
+
+    if(numRes == 0)
+      continue;
+
+    // 1. Set up ImageComponentFlow for this component
+    dwtHelper_->release();
+    dwtHelper_->clear();
+    dwtHelper_->setupComponentFlow(compno, numRes, !wholeTileDecoding);
+
+    // 2. Collect blocks per resolution and schedule T1 decode into flow
+    // Combine first two resolution levels (0+1) into a single flow slot (like standard scheduler)
+    uint8_t flowResIdx = 0;
+    for(uint8_t resno = resBegin; resno < numRes; ++resno)
+    {
+      auto res = tilec->resolutions_ + resno;
+      for(uint8_t bandIndex = 0; bandIndex < res->numBands_; ++bandIndex)
+      {
+        auto band = res->band + bandIndex;
+        auto paddedBandWindow = tilec->getBandWindowPadded(resno, band->orientation_);
+        for(auto precinct : band->precincts_)
+        {
+          // Region decode: skip precincts/code blocks that miss the padded band window.
+          if(!wholeTileDecoding && !paddedBandWindow->nonEmptyIntersection(precinct))
+            continue;
+          for(uint32_t cblkno = 0; cblkno < precinct->getNumCblks(); ++cblkno)
+          {
+            auto cblkBounds = precinct->getCodeBlockBounds(cblkno);
+            if(!wholeTileDecoding && !paddedBandWindow->nonEmptyIntersection(&cblkBounds))
+              continue;
+
+            auto cblk = precinct->getDecompressBlock(cblkno);
+            auto block = std::make_shared<t1::DecompressBlockExec>(cacheAll);
+            block->x = cblk->x0();
+            block->y = cblk->y0();
+            block->postProcessor_ =
+                tcp->isHT() ? t1::DecompressBlockPostProcessor<int32_t>(
+                                  [tilec](int32_t* srcData, t1::DecompressBlockExec* blk,
+                                          uint16_t stride) {
+                                    tilec->postProcessBlockHT(srcData, blk, stride);
+                                  })
+                            : t1::DecompressBlockPostProcessor<int32_t>(
+                                  [tilec](int32_t* srcData, t1::DecompressBlockExec* blk,
+                                          [[maybe_unused]] uint16_t stride) {
+                                    tilec->postProcessBlock(srcData, blk);
+                                  });
+            block->bandIndex = bandIndex;
+            block->bandNumbps = band->maxBitPlanes_;
+            block->bandOrientation = band->orientation_;
+            block->cblk = cblk;
+            block->cblk_sty = tccp->cblkStyle_;
+            block->qmfbid = tccp->qmfbid_;
+            block->resno = resno;
+            block->roishift = tccp->roishift_;
+            block->stepsize = band->stepsize_;
+            block->k_msbs = (uint8_t)(band->maxBitPlanes_ - cblk->numbps());
+            block->R_b = prec_ + gain_b[band->orientation_];
+            block->finalLayer_ = finalLayer;
+
+            // Schedule T1 decode into the flow
+            auto imageComponentFlow = dwtHelper_->getImageComponentFlow(compno);
+            auto resFlow = imageComponentFlow->getResflow(flowResIdx);
+            // The lambda is mutable because block is captured by value and
+            // std::shared_ptr::reset() is a non-const member function.
+            // NOTE(review): success_ is written from inside tasks; confirm it is
+            // safe for concurrent access (e.g. std::atomic<bool>).
+            resFlow->blocks_->nextTask().work(
+                [this, activePool, block, tccp, cbw, cbh, cacheAll, tileProcessor]() mutable {
+                  if(!success_)
+                  {
+                    // An earlier failure was recorded: drop the block and skip decoding.
+                    block.reset();
+                    return;
+                  }
+                  t1::ICoder* coder = nullptr;
+                  if(block->needsCachedCoder())
+                  {
+                    coder = t1::CoderFactory::makeCoder(tileProcessor->getTCP()->isHT(), false, cbw,
+                                                       cbh, tileProcessor->getTileCacheStrategy());
+                  }
+                  else if(!cacheAll)
+                  {
+                    auto threadnum = TFSingleton::get().this_worker_id();
+                    coder =
+                        activePool->getCoder((size_t)threadnum, tccp->cblkw_expn_, tccp->cblkh_expn_)
+                            .get();
+                  }
+                  try
+                  {
+                    if(!block->open(coder))
+                      success_ = false;
+                  }
+                  catch(const std::runtime_error& rerr)
+                  {
+                    grklog.error(rerr.what());
+                    success_ = false;
+                  }
+                });
+          }
+        }
+      }
+      // Combine res 0 with res 1 into the same flow slot (same as standard scheduler)
+      if(resno == 0 && numRes > 1)
+        continue;
+      flowResIdx++;
+    }
+    tilec->currentPacketProgressionState_ = tilec->nextPacketProgressionState_;
+
+    if(!success_)
+      return false;
+
+    // 3. Schedule DWT (if more than 1 resolution)
+    if(numRes > 1)
+    {
+      auto maxDim = std::max(tileProcessor->getCodingParams()->t_width_,
+                             tileProcessor->getCodingParams()->t_height_);
+
+      WaveletReverse wavelet(dwtHelper_, tilec, compno, tilec->windowUnreducedBounds(), numRes,
+                             tccp->qmfbid_, maxDim, tcp->wholeTileDecompress_, waveletPoolData_);
+      if(!wavelet.decompress())
+        return false;
+    }
+
+    // 4. Run the complete T1+DWT flow
+    TFSingleton::get().run(*dwtHelper_).wait();
+    dwtHelper_->release();
+    dwtHelper_->clear();
+  }
+
+  return success_;
+}
+#endif
+
 bool SchedulerFreebyrd::runCascadeDWT97([[maybe_unused]] ITileProcessor* tileProcessor,
                                         [[maybe_unused]] uint16_t compno)
 {
diff --git a/src/lib/core/scheduling/freebyrd/SchedulerFreebyrd.h b/src/lib/core/scheduling/freebyrd/SchedulerFreebyrd.h
@@ -88,6 +88,9 @@ class SchedulerFreebyrd
   bool runSeparateDWT53(ITileProcessor* tileProcessor, uint16_t compno);
   bool runSeparateDWT16(ITileProcessor* tileProcessor, uint16_t compno);
   bool postProcess(ITileProcessor* tileProcessor);
+#ifndef GRK_USE_SCX_SCHEDULING
+  bool decodeAndTransform(ITileProcessor* tileProcessor);
+#endif
 
   uint16_t numcomps_;
   uint8_t prec_;