GrokImageCompression
diff --git a/src/lib/core/scheduling/freebyrd/SchedulerFreebyrd.cpp b/src/lib/core/scheduling/freebyrd/SchedulerFreebyrd.cpp
Lines changed: 218 additions & 11 deletions
@@ -603,8 +603,8 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
       continue;
 
     auto tilec = tileProcessor->getTile()->comps_ + compno;
-    // Only use strip-aware gates for whole-tile, non-16-bit
-    if(!tilec->isWholeTileDecoding() || tilec->is16BitDwt())
+    // Only use strip-aware gates for whole-tile decode
+    if(!tilec->isWholeTileDecoding())
       continue;
 
     auto& gates = compGates[compno];
@@ -1024,13 +1024,113 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
     }
   };
 
+  // Shared state per resolution for 16-bit (per-thread intermediate buffers); each strip job of the resolution holds a pointer to one of these and indexes the buffers by thread id
+  struct FusedStripShared16
+  {
+    WaveletReverse* wavelet = nullptr; // raw pointer; provides horizPool16_/vertPool16_ and the strip kernels called by the jobs
+    uint32_t intermediateStride = 0; // row stride, in int16_t elements, of threadBufL / threadBufH
+    uint32_t maxLRows = 0; // largest rangeL.count() over the strips of this resolution
+    uint32_t maxHRows = 0; // largest rangeH.count() over the strips of this resolution
+    std::vector<std::unique_ptr<int16_t[]>> threadBufL; // one buffer per thread: H-DWT output for the L rows of a strip
+    std::vector<std::unique_ptr<int16_t[]>> threadBufH; // one buffer per thread: H-DWT output for the H rows of a strip
+  };
+
+  // Per-strip job context for 16-bit (both 5/3 and 9/7); the job's address is handed to the engine as user data and execute() runs the two horizontal passes, the vertical cascade and the gate signalling for that strip
+  struct FusedStripJob16
+  {
+    FusedStripShared16* shared; // per-resolution state: wavelet, intermediate stride, per-thread buffers
+    ScxEngine* engine; // engine on which the dependent gates are signalled
+    StripGeometry geom; // strip row ranges (rangeL, rangeH), localParity and output placement
+    Buffer2dSimple<int16_t> llBand, hlBand, lhBand, hhBand; // input windows: LL/HL feed the L rows, LH/HH feed the H rows
+    Buffer2dSimple<int16_t> winDest; // destination window handed to the vertical cascade kernel
+    uint32_t hSn, hDn, hParity; // horizontal sn / dn / parity copied into the thread's scratch in step 1
+    uint32_t resWidth; // passed to the vertical cascade kernels
+    uint8_t qmfbid; // 1 selects the *_53 kernels, any other value the *_97 kernels
+    DcShiftParam dcShift; // forwarded unchanged to the vertical cascade kernels
+    std::vector<uint32_t> dependentGates; // gate ids signalled once this strip has been processed
+
+    static void execute([[maybe_unused]] size_t i, size_t thread_id, void* ud) // ud must point at a FusedStripJob16; thread_id selects the per-thread scratch and buffers
+    {
+      auto* job = static_cast<FusedStripJob16*>(ud);
+      auto* wav = job->shared->wavelet;
+
+      const uint32_t stride = job->shared->intermediateStride;
+      int16_t* tempL = job->shared->threadBufL[thread_id].get(); // intermediate buffers belonging to the calling thread
+      int16_t* tempH = job->shared->threadBufH[thread_id].get();
+
+      // === Step 1: H-DWT for L rows === (LL + HL rows of rangeL are written to tempL)
+      {
+        wav->horizPool16_[thread_id].sn = job->hSn; // load this job's horizontal parameters into the thread's scratch
+        wav->horizPool16_[thread_id].dn = job->hDn;
+        wav->horizPool16_[thread_id].parity = job->hParity;
+
+        auto winL = job->llBand; // local copies, so the job's own band windows are not advanced
+        auto winH = job->hlBand;
+        winL.incY_IN_PLACE(job->geom.rangeL.lo);
+        winH.incY_IN_PLACE(job->geom.rangeL.lo);
+        Buffer2dSimple<int16_t> dest(tempL, stride, job->geom.rangeL.count());
+
+        if(job->qmfbid == 1)
+          wav->h_strip_16_53(&wav->horizPool16_[thread_id], 0, job->geom.rangeL.count(), winL, winH,
+                             dest);
+        else
+          wav->h_strip_16_97(&wav->horizPool16_[thread_id], 0, job->geom.rangeL.count(), winL, winH,
+                             dest);
+      }
+
+      // === Step 2: H-DWT for H rows === (LH + HH rows of rangeH are written to tempH; the scratch keeps the sn/dn/parity set in step 1)
+      {
+        auto winL = job->lhBand;
+        auto winH = job->hhBand;
+        winL.incY_IN_PLACE(job->geom.rangeH.lo);
+        winH.incY_IN_PLACE(job->geom.rangeH.lo);
+        Buffer2dSimple<int16_t> dest(tempH, stride, job->geom.rangeH.count());
+
+        if(job->qmfbid == 1)
+          wav->h_strip_16_53(&wav->horizPool16_[thread_id], 0, job->geom.rangeH.count(), winL, winH,
+                             dest);
+        else
+          wav->h_strip_16_97(&wav->horizPool16_[thread_id], 0, job->geom.rangeH.count(), winL, winH,
+                             dest);
+      }
+
+      // === Step 3: Cascade V-DWT === (consumes tempL / tempH and writes into winDest)
+      {
+        uint32_t localSn = job->geom.rangeL.count(); // strip-local row counts, not those of the whole resolution
+        uint32_t localDn = job->geom.rangeH.count();
+
+        wav->vertPool16_[thread_id].sn = localSn;
+        wav->vertPool16_[thread_id].dn = localDn;
+        wav->vertPool16_[thread_id].parity = job->geom.localParity;
+
+        Buffer2dSimple<int16_t> winL(tempL, stride, localSn);
+        Buffer2dSimple<int16_t> winH(tempH, stride, localDn);
+
+        if(job->qmfbid == 1)
+          wav->v_cascade_strip_16_53(&wav->vertPool16_[thread_id], 0, job->resWidth, winL, winH,
+                                     job->winDest, job->dcShift, job->geom.outputStartInStripe,
+                                     job->geom.outCount);
+        else
+          wav->v_cascade_strip_16_97(&wav->vertPool16_[thread_id], 0, job->resWidth, winL, winH,
+                                     job->winDest, job->dcShift, job->geom.outputStartInStripe,
+                                     job->geom.outCount);
+      }
+
+      // === Step 4: Signal downstream strip gates === (done last, after all three transform steps)
+      for(auto gateId : job->dependentGates)
+        scx_engine_signal_gate(job->engine, gateId);
+    }
+  };
+
   struct CompDwtState
   {
     std::unique_ptr<WaveletReverse> wavelet;
-    std::vector<FusedStripShared> shared; // one per resolution (5/3)
-    std::vector<FusedStripJob> jobs; // all strip jobs (5/3)
-    std::vector<FusedStripShared97> shared97; // one per resolution (9/7)
-    std::vector<FusedStripJob97> jobs97; // all strip jobs (9/7)
+    std::vector<FusedStripShared> shared; // one per resolution (5/3 32-bit)
+    std::vector<FusedStripJob> jobs; // all strip jobs (5/3 32-bit)
+    std::vector<FusedStripShared97> shared97; // one per resolution (9/7 float)
+    std::vector<FusedStripJob97> jobs97; // all strip jobs (9/7 float)
+    std::vector<FusedStripShared16> shared16; // one per resolution (16-bit, both 5/3 and 9/7); resized to numRes - 1 before any job stores a pointer into it
+    std::vector<FusedStripJob16> jobs16; // all strip jobs (16-bit); NOTE(review): &jobs16[i] is handed to the engine at submit time while later resolutions still push_back - confirm capacity is reserved up front, otherwise a reallocation leaves those pointers dangling
     bool usedScxDwt = false;
   };
   std::vector<CompDwtState> dwtStates(numcomps_);
@@ -1045,8 +1145,8 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
     auto maxDim = std::max(tileProcessor->getCodingParams()->t_width_,
                            tileProcessor->getCodingParams()->t_height_);
 
-    // Don't use ScxEngine DWT for partial decompress or 16-bit (handled in fallback)
-    if(!tilec->isWholeTileDecoding() || tilec->is16BitDwt())
+    // Don't use ScxEngine DWT for partial decompress (handled in fallback)
+    if(!tilec->isWholeTileDecoding())
       continue;
 
     auto& ds = dwtStates[compno];
@@ -1063,11 +1163,117 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
     auto& gates = compGates[compno];
 
     auto bandLL = tilec->resolutions_;
-    auto tileBuffer = tilec->getWindow();
 
-    if(cw.qmfbid == 1)
+    if(tilec->is16BitDwt())
+    {
+      // === 16-bit DWT path (both 5/3 and 9/7) ===
+      auto tileBuffer16 = tilec->getWindow16();
+      wav->horizPool16_ = std::make_unique<dwt_scratch<int16_t>[]>(num_threads);
+      wav->vertPool16_ = std::make_unique<dwt_scratch<int16_t>[]>(num_threads);
+
+      ds.shared16.resize(cw.numRes - 1);
+
+      for(uint8_t res = 1; res < cw.numRes; ++res)
+      {
+        wav->horiz_.sn = bandLL->width();
+        wav->vert_.sn = bandLL->height();
+        for(uint32_t i = 0; i < num_threads; ++i)
+        {
+          wav->horizPool16_[i].sn = bandLL->width();
+          wav->vertPool16_[i].sn = bandLL->height();
+        }
+        ++bandLL;
+        auto resWidth = bandLL->width();
+        auto resHeight = bandLL->height();
+        if(resWidth == 0 || resHeight == 0)
+          continue;
+        wav->horiz_.dn = resWidth - wav->horiz_.sn;
+        wav->horiz_.parity = bandLL->x0 & 1;
+        wav->vert_.dn = resHeight - wav->vert_.sn;
+        wav->vert_.parity = bandLL->y0 & 1;
+        for(uint32_t i = 0; i < num_threads; ++i)
+        {
+          wav->horizPool16_[i].dn = resWidth - wav->horizPool16_[i].sn;
+          wav->horizPool16_[i].parity = bandLL->x0 & 1;
+          wav->horizPool16_[i].allocatedMem = (int16_t*)waveletPoolData_->getHoriz(i);
+          wav->horizPool16_[i].mem = (int16_t*)waveletPoolData_->getHoriz(i);
+
+          wav->vertPool16_[i].dn = resHeight - wav->vertPool16_[i].sn;
+          wav->vertPool16_[i].parity = bandLL->y0 & 1;
+          wav->vertPool16_[i].allocatedMem = (int16_t*)waveletPoolData_->getVert(i);
+          wav->vertPool16_[i].mem = (int16_t*)waveletPoolData_->getVert(i);
+        }
+
+        auto& stripGeoms = gates.stripGeoms[res - 1];
+        if(stripGeoms.empty())
+          continue;
+
+        uint32_t intermediateStride = (resWidth + 15U) & ~15U;
+
+        uint32_t maxLRows = 0, maxHRows = 0;
+        for(auto& sg : stripGeoms)
+        {
+          maxLRows = std::max(maxLRows, sg.rangeL.count());
+          maxHRows = std::max(maxHRows, sg.rangeH.count());
+        }
+
+        auto& sh = ds.shared16[res - 1];
+        sh.wavelet = wav;
+        sh.intermediateStride = intermediateStride;
+        sh.maxLRows = maxLRows;
+        sh.maxHRows = maxHRows;
+        sh.threadBufL.resize(num_threads);
+        sh.threadBufH.resize(num_threads);
+        for(uint32_t t = 0; t < num_threads; ++t)
+        {
+          sh.threadBufL[t] = std::make_unique<int16_t[]>((size_t)intermediateStride * maxLRows);
+          sh.threadBufH[t] = std::make_unique<int16_t[]>((size_t)intermediateStride * maxHRows);
+        }
+
+        DcShiftParam dcShift = (res == cw.numRes - 1) ? wav->dcShift_ : DcShiftParam{};
+
+        auto llBand = tileBuffer16->getResWindowBufferSimple((uint8_t)(res - 1U));
+        auto hlBand = tileBuffer16->getBandWindowBufferPaddedSimple(res, t1::BAND_ORIENT_HL);
+        auto lhBand = tileBuffer16->getBandWindowBufferPaddedSimple(res, t1::BAND_ORIENT_LH);
+        auto hhBand = tileBuffer16->getBandWindowBufferPaddedSimple(res, t1::BAND_ORIENT_HH);
+        auto winDest = tileBuffer16->getResWindowBufferSimple(res);
+
+        size_t jobBase = ds.jobs16.size();
+
+        for(size_t s = 0; s < stripGeoms.size(); ++s)
+        {
+          FusedStripJob16 job;
+          job.shared = &sh;
+          job.engine = engine;
+          job.geom = stripGeoms[s];
+          job.llBand = llBand;
+          job.hlBand = hlBand;
+          job.lhBand = lhBand;
+          job.hhBand = hhBand;
+          job.winDest = winDest;
+          job.winDest.incY_IN_PLACE(stripGeoms[s].outStart);
+          job.hSn = wav->horiz_.sn;
+          job.hDn = wav->horiz_.dn;
+          job.hParity = wav->horiz_.parity;
+          job.resWidth = resWidth;
+          job.qmfbid = cw.qmfbid;
+          job.dcShift = dcShift;
+          job.dependentGates = gates.crossResDepGates[res - 1][s];
+          ds.jobs16.push_back(std::move(job));
+        }
+
+        for(size_t s = 0; s < stripGeoms.size(); ++s)
+        {
+          uint32_t stripGate = gates.stripGateIds[res - 1][s];
+          scx_engine_submit_full_batch(engine, cw.dwtDomainId, 0, 1, FusedStripJob16::execute,
+                                       &ds.jobs16[jobBase + s], stripGate, SCX_NO_GATE);
+        }
+      }
+    }
+    else if(cw.qmfbid == 1)
     {
-      // === 5/3 reversible DWT path ===
+      // === 5/3 reversible DWT path (32-bit) ===
+      auto tileBuffer = tilec->getWindow();
       wav->horizPool_ = std::make_unique<dwt_scratch<int32_t>[]>(num_threads);
       wav->vertPool_ = std::make_unique<dwt_scratch<int32_t>[]>(num_threads);
 
@@ -1179,6 +1385,7 @@ bool SchedulerFreebyrd::decodeAndTransformScx(ITileProcessor* tileProcessor)
     else
     {
       // === 9/7 irreversible DWT path ===
+      auto tileBuffer = tilec->getWindow();
       ds.shared97.resize(cw.numRes - 1);
 
       for(uint8_t res = 1; res < cw.numRes; ++res)