Skip to content

Commit 5c83e2e

Browse files
author
Grok Compression
committed
Implement interleaved T1+DWT in freebyrd scheduler (Phase 4)
Non-SCX path now uses a per-component DAG that interleaves T1 decode and DWT per resolution level, instead of decoding ALL blocks first then running ALL DWT. TaskFlow manages dependencies: - T1 blocks at res N complete before DWT at res N starts - DWT at res N completes before T1 at res N+1 starts This matches the sicorax domain-sequence approach: work is organized by resolution level so that DWT can begin as soon as its input data (decoded subbands + previous level output) is available. SCX path (Phase 3) remains unchanged: sequential T1 → DWT. Test results: 437/437 decode tests pass (zero regressions).
1 parent be42875 commit 5c83e2e

2 files changed

Lines changed: 173 additions & 2 deletions

File tree

src/lib/core/scheduling/freebyrd/SchedulerFreebyrd.cpp

Lines changed: 170 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,6 @@ class DwtFlowHelper : public SchedulerStandard
8989
imageComponentFlow_[compno] = new ImageComponentFlow(numRes);
9090
if(regionDecompress)
9191
imageComponentFlow_[compno]->setRegionDecompression();
92-
// addTo must be called before graph to initialize composition tasks
9392
imageComponentFlow_[compno]->addTo(*this);
9493
SchedulerStandard::graph(compno);
9594
}
@@ -116,11 +115,17 @@ bool SchedulerFreebyrd::decompressTile(ITileProcessor* tileProcessor)
116115
{
117116
success_ = true;
118117

118+
#ifdef GRK_USE_SCX_SCHEDULING
119+
// SCX path: sequential T1 → DWT (Phase 3)
119120
if(!decodeBlocks(tileProcessor))
120121
return false;
121-
122122
if(!runDWT(tileProcessor))
123123
return false;
124+
#else
125+
// Interleaved T1+DWT: per-component DAG with T1 and DWT tasks (Phase 4)
126+
if(!decodeAndTransform(tileProcessor))
127+
return false;
128+
#endif
124129

125130
if(!postProcess(tileProcessor))
126131
return false;
@@ -377,6 +382,169 @@ bool SchedulerFreebyrd::runDWT(ITileProcessor* tileProcessor)
377382
return true;
378383
}
379384

385+
#ifndef GRK_USE_SCX_SCHEDULING
386+
bool SchedulerFreebyrd::decodeAndTransform(ITileProcessor* tileProcessor)
387+
{
388+
auto tcp = tileProcessor->getTCP();
389+
bool cacheAll =
390+
(tileProcessor->getTileCacheStrategy() & GRK_TILE_CACHE_ALL) == GRK_TILE_CACHE_ALL;
391+
uint32_t num_threads = (uint32_t)TFSingleton::num_threads();
392+
bool finalLayer = tcp->layersToDecompress_ == tcp->numLayers_;
393+
394+
for(uint16_t compno = 0; compno < numcomps_; ++compno)
395+
{
396+
if(!tileProcessor->shouldDecodeComponent(compno))
397+
continue;
398+
399+
auto tccp = tcp->tccps_ + compno;
400+
uint16_t cbw = tccp->cblkw_expn_ ? (uint16_t)(1 << tccp->cblkw_expn_) : 0U;
401+
uint16_t cbh = tccp->cblkh_expn_ ? (uint16_t)(1 << tccp->cblkh_expn_) : 0U;
402+
auto activePool = &coderPool_;
403+
if(streamPool_ && streamPool_->contains(tccp->cblkw_expn_, tccp->cblkh_expn_))
404+
activePool = streamPool_;
405+
406+
if(!cacheAll)
407+
{
408+
activePool->makeCoders(
409+
num_threads, tccp->cblkw_expn_, tccp->cblkh_expn_,
410+
[tcp, cbw, cbh, tileProcessor]() -> std::shared_ptr<t1::ICoder> {
411+
return std::shared_ptr<t1::ICoder>(t1::CoderFactory::makeCoder(
412+
tcp->isHT(), false, cbw, cbh, tileProcessor->getTileCacheStrategy()));
413+
});
414+
}
415+
416+
auto tilec = tileProcessor->getTile()->comps_ + compno;
417+
auto wholeTileDecoding = tilec->isWholeTileDecoding();
418+
uint8_t resBegin =
419+
cacheAll ? (uint8_t)tilec->currentPacketProgressionState_.numResolutionsRead() : 0;
420+
uint8_t resUpperBound = tilec->nextPacketProgressionState_.numResolutionsRead();
421+
uint8_t numRes = tilec->nextPacketProgressionState_.numResolutionsRead();
422+
423+
if(numRes == 0)
424+
continue;
425+
426+
// 1. Set up ImageComponentFlow for this component
427+
dwtHelper_->release();
428+
dwtHelper_->clear();
429+
dwtHelper_->setupComponentFlow(compno, numRes, !wholeTileDecoding);
430+
431+
// 2. Collect blocks per resolution and schedule T1 decode into flow
432+
// Combine first two resolution levels (0+1) into a single flow slot (like standard scheduler)
433+
uint8_t flowResIdx = 0;
434+
for(uint8_t resno = resBegin; resno < resUpperBound; ++resno)
435+
{
436+
auto res = tilec->resolutions_ + resno;
437+
for(uint8_t bandIndex = 0; bandIndex < res->numBands_; ++bandIndex)
438+
{
439+
auto band = res->band + bandIndex;
440+
auto paddedBandWindow = tilec->getBandWindowPadded(resno, band->orientation_);
441+
for(auto precinct : band->precincts_)
442+
{
443+
if(!wholeTileDecoding && !paddedBandWindow->nonEmptyIntersection(precinct))
444+
continue;
445+
for(uint32_t cblkno = 0; cblkno < precinct->getNumCblks(); ++cblkno)
446+
{
447+
auto cblkBounds = precinct->getCodeBlockBounds(cblkno);
448+
if(!wholeTileDecoding && !paddedBandWindow->nonEmptyIntersection(&cblkBounds))
449+
continue;
450+
451+
auto cblk = precinct->getDecompressBlock(cblkno);
452+
auto block = std::make_shared<t1::DecompressBlockExec>(cacheAll);
453+
block->x = cblk->x0();
454+
block->y = cblk->y0();
455+
block->postProcessor_ =
456+
tcp->isHT() ? t1::DecompressBlockPostProcessor<int32_t>(
457+
[tilec](int32_t* srcData, t1::DecompressBlockExec* blk,
458+
uint16_t stride) {
459+
tilec->postProcessBlockHT(srcData, blk, stride);
460+
})
461+
: t1::DecompressBlockPostProcessor<int32_t>(
462+
[tilec](int32_t* srcData, t1::DecompressBlockExec* blk,
463+
[[maybe_unused]] uint16_t stride) {
464+
tilec->postProcessBlock(srcData, blk);
465+
});
466+
block->bandIndex = bandIndex;
467+
block->bandNumbps = band->maxBitPlanes_;
468+
block->bandOrientation = band->orientation_;
469+
block->cblk = cblk;
470+
block->cblk_sty = tccp->cblkStyle_;
471+
block->qmfbid = tccp->qmfbid_;
472+
block->resno = resno;
473+
block->roishift = tccp->roishift_;
474+
block->stepsize = band->stepsize_;
475+
block->k_msbs = (uint8_t)(band->maxBitPlanes_ - cblk->numbps());
476+
block->R_b = prec_ + gain_b[band->orientation_];
477+
block->finalLayer_ = finalLayer;
478+
479+
// Schedule T1 decode into the flow
480+
auto imageComponentFlow = dwtHelper_->getImageComponentFlow(compno);
481+
auto resFlow = imageComponentFlow->getResflow(flowResIdx);
482+
resFlow->blocks_->nextTask().work(
483+
[this, activePool, block, tccp, cbw, cbh, cacheAll, tileProcessor]() {
484+
if(!success_)
485+
{
486+
block.reset();
487+
return;
488+
}
489+
t1::ICoder* coder = nullptr;
490+
if(block->needsCachedCoder())
491+
{
492+
coder = t1::CoderFactory::makeCoder(tileProcessor->getTCP()->isHT(), false, cbw,
493+
cbh, tileProcessor->getTileCacheStrategy());
494+
}
495+
else if(!cacheAll)
496+
{
497+
auto threadnum = TFSingleton::get().this_worker_id();
498+
coder =
499+
activePool->getCoder((size_t)threadnum, tccp->cblkw_expn_, tccp->cblkh_expn_)
500+
.get();
501+
}
502+
try
503+
{
504+
if(!block->open(coder))
505+
success_ = false;
506+
}
507+
catch(const std::runtime_error& rerr)
508+
{
509+
grklog.error(rerr.what());
510+
success_ = false;
511+
}
512+
});
513+
}
514+
}
515+
}
516+
// Combine res 0 with res 1 into the same flow slot (same as standard scheduler)
517+
if(resno == 0 && resUpperBound > 1)
518+
continue;
519+
flowResIdx++;
520+
}
521+
tilec->currentPacketProgressionState_ = tilec->nextPacketProgressionState_;
522+
523+
if(!success_)
524+
return false;
525+
526+
// 3. Schedule DWT (if more than 1 resolution)
527+
if(numRes > 1)
528+
{
529+
auto maxDim = std::max(tileProcessor->getCodingParams()->t_width_,
530+
tileProcessor->getCodingParams()->t_height_);
531+
532+
WaveletReverse wavelet(dwtHelper_, tilec, compno, tilec->windowUnreducedBounds(), numRes,
533+
tccp->qmfbid_, maxDim, tcp->wholeTileDecompress_, waveletPoolData_);
534+
if(!wavelet.decompress())
535+
return false;
536+
}
537+
538+
// 4. Run the complete T1+DWT flow
539+
TFSingleton::get().run(*dwtHelper_).wait();
540+
dwtHelper_->release();
541+
dwtHelper_->clear();
542+
}
543+
544+
return success_;
545+
}
546+
#endif
547+
380548
bool SchedulerFreebyrd::runCascadeDWT97([[maybe_unused]] ITileProcessor* tileProcessor,
381549
[[maybe_unused]] uint16_t compno)
382550
{

src/lib/core/scheduling/freebyrd/SchedulerFreebyrd.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,9 @@ class SchedulerFreebyrd
8888
bool runSeparateDWT53(ITileProcessor* tileProcessor, uint16_t compno);
8989
bool runSeparateDWT16(ITileProcessor* tileProcessor, uint16_t compno);
9090
bool postProcess(ITileProcessor* tileProcessor);
91+
#ifndef GRK_USE_SCX_SCHEDULING
92+
bool decodeAndTransform(ITileProcessor* tileProcessor);
93+
#endif
9194

9295
uint16_t numcomps_;
9396
uint8_t prec_;

0 commit comments

Comments
 (0)