diff --git a/cpp/pixels-retina/include/RGVisibility.h b/cpp/pixels-retina/include/RGVisibility.h index f1f29a0147..144cb4833a 100644 --- a/cpp/pixels-retina/include/RGVisibility.h +++ b/cpp/pixels-retina/include/RGVisibility.h @@ -21,19 +21,23 @@ #define RG_VISIBILITY_H #include "RetinaBase.h" #include "TileVisibility.h" +#include #include template class RGVisibility : public pixels::RetinaBase> { public: - explicit RGVisibility(uint64_t rgRecordNum); - explicit RGVisibility(uint64_t rgRecordNum, uint64_t timestamp, const std::vector& initialBitmap); + explicit RGVisibility(uint64_t rgRecordNum, uint64_t timestamp = 0, + const std::vector* initialBitmap = nullptr); ~RGVisibility() override; void deleteRGRecord(uint32_t rowId, uint64_t timestamp); uint64_t* getRGVisibilityBitmap(uint64_t timestamp); - void collectRGGarbage(uint64_t timestamp); + std::vector collectRGGarbage(uint64_t timestamp); + + std::vector exportChainItemsAfter(uint64_t safeGcTs) const; + void importDeletionChain(const uint64_t* items, size_t pairCount); uint64_t getBitmapSize() const; diff --git a/cpp/pixels-retina/include/RGVisibilityJni.h b/cpp/pixels-retina/include/RGVisibilityJni.h index 37339942d4..c8bb1fc3a5 100644 --- a/cpp/pixels-retina/include/RGVisibilityJni.h +++ b/cpp/pixels-retina/include/RGVisibilityJni.h @@ -10,17 +10,9 @@ extern "C" { /* * Class: io_pixelsdb_pixels_retina_RGVisibility * Method: createNativeObject - * Signature: (J)J - */ -JNIEXPORT jlong JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_createNativeObject - (JNIEnv *, jobject, jlong); - -/* - * Class: io_pixelsdb_pixels_retina_RGVisibility - * Method: createNativeObjectInitialized * Signature: (JJ[J)J */ -JNIEXPORT jlong JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_createNativeObjectInitialized +JNIEXPORT jlong JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_createNativeObject (JNIEnv *, jobject, jlong, jlong, jlongArray); /* @@ -50,11 +42,27 @@ JNIEXPORT jlongArray JNICALL 
Java_io_pixelsdb_pixels_retina_RGVisibility_getVisi /* * Class: io_pixelsdb_pixels_retina_RGVisibility * Method: garbageCollect - * Signature: (JJ)V + * Signature: (JJ)[J + */ +JNIEXPORT jlongArray JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_garbageCollect + (JNIEnv *, jobject, jlong, jlong); + +/* + * Class: io_pixelsdb_pixels_retina_RGVisibility + * Method: exportChainItemsAfter + * Signature: (JJ)[J */ -JNIEXPORT void JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_garbageCollect +JNIEXPORT jlongArray JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_exportChainItemsAfter (JNIEnv *, jobject, jlong, jlong); +/* + * Class: io_pixelsdb_pixels_retina_RGVisibility + * Method: importDeletionChain + * Signature: ([JJ)V + */ +JNIEXPORT void JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_importDeletionChain + (JNIEnv *, jobject, jlongArray, jlong); + /* * Class: io_pixelsdb_pixels_retina_RGVisibility * Method: getNativeMemoryUsage diff --git a/cpp/pixels-retina/include/TileVisibility.h b/cpp/pixels-retina/include/TileVisibility.h index a7ac6c790d..ef9bd59143 100644 --- a/cpp/pixels-retina/include/TileVisibility.h +++ b/cpp/pixels-retina/include/TileVisibility.h @@ -32,6 +32,7 @@ #include #include #include +#include #include // rowId supports up to 65535, timestamp uses 48 bits @@ -65,13 +66,13 @@ struct VersionedData : public pixels::RetinaBase> { uint64_t baseTimestamp; DeleteIndexBlock* head; // Delete chain head, part of the version - VersionedData() : baseTimestamp(0), head(nullptr) { - std::memset(baseBitmap, 0, sizeof(baseBitmap)); - } - - VersionedData(uint64_t ts, const uint64_t* bitmap, DeleteIndexBlock* h) + // timestamp defaults to 0; bitmap defaults to all-zeros. 
+ explicit VersionedData(uint64_t ts = 0, const uint64_t* bitmap = nullptr, DeleteIndexBlock* h = nullptr) : baseTimestamp(ts), head(h) { - std::memcpy(baseBitmap, bitmap, NUM_WORDS * sizeof(uint64_t)); + if (bitmap) + std::memcpy(baseBitmap, bitmap, NUM_WORDS * sizeof(uint64_t)); + else + std::memset(baseBitmap, 0, sizeof(baseBitmap)); } }; @@ -92,12 +93,15 @@ template class TileVisibility : public pixels::RetinaBase> { static constexpr size_t NUM_WORDS = BITMAP_WORDS(CAPACITY); public: - TileVisibility(); - TileVisibility(uint64_t ts, const uint64_t* bitmap); + // timestamp defaults to 0; bitmap defaults to all-zeros. + explicit TileVisibility(uint64_t timestamp = 0, const uint64_t* bitmap = nullptr); ~TileVisibility() override; void deleteTileRecord(uint16_t rowId, uint64_t ts); void getTileVisibilityBitmap(uint64_t ts, uint64_t* outBitmap) const; - void collectTileGarbage(uint64_t ts); + void collectTileGarbage(uint64_t ts, uint64_t* gcSnapshotBitmap); + void exportChainItemsAfter(uint32_t tileId, uint64_t safeGcTs, + std::vector>& gcChainItems) const; + void importDeletionItems(std::vector& bucket); private: TileVisibility(const TileVisibility &) = delete; @@ -108,7 +112,22 @@ class TileVisibility : public pixels::RetinaBase> { std::atomic*> currentVersion; std::atomic tail; std::atomic tailUsed; - std::vector> retired; // Protected by GC (single writer) + + // Retired versions awaiting epoch-based reclamation. Only the GC thread + // (collectTileGarbage / reclaimRetiredVersions) reads and writes this vector, + // so no locking is needed. + std::vector> retired; + + // Lock-free staging slot between deleteTileRecord (CDC threads) and GC. + // deleteTileRecord's empty-chain path replaces currentVersion but cannot + // write `retired` directly — that would race with the GC thread. Instead + // it atomically stores oldVer here. The GC thread drains this slot at the + // start of collectTileGarbage, moving it into `retired` with a proper epoch. 
+ // Flow: deleteTileRecord → pendingRetire.store → collectTileGarbage → + // pendingRetire.exchange(nullptr) → retired.emplace_back → reclaimRetiredVersions + // At most one version is pending per GC cycle (the empty-chain path fires + // at most once between consecutive GC compactions). + std::atomic*> pendingRetire{nullptr}; }; #endif // PIXELS_RETINA_TILE_VISIBILITY_H diff --git a/cpp/pixels-retina/lib/RGVisibility.cpp b/cpp/pixels-retina/lib/RGVisibility.cpp index 47c2107027..d1609535f0 100644 --- a/cpp/pixels-retina/lib/RGVisibility.cpp +++ b/cpp/pixels-retina/lib/RGVisibility.cpp @@ -22,35 +22,23 @@ #include #include +// Validates before allocation: any throw leaves tileVisibilities as nullptr, +// so the incomplete constructor does not invoke the destructor (no memory leak). template -RGVisibility::RGVisibility(uint64_t rgRecordNum) - : tileCount((rgRecordNum + VISIBILITY_RECORD_CAPACITY - 1) / VISIBILITY_RECORD_CAPACITY) { - size_t allocSize = tileCount * sizeof(TileVisibility); - void* rawMemory = operator new[](allocSize); - tileVisibilities = static_cast*>(rawMemory); - for (uint64_t i = 0; i < tileCount; ++i) { - new (&tileVisibilities[i]) TileVisibility(); - } -} +RGVisibility::RGVisibility(uint64_t rgRecordNum, uint64_t timestamp, + const std::vector* initialBitmap) + : tileCount((rgRecordNum + VISIBILITY_RECORD_CAPACITY - 1) / VISIBILITY_RECORD_CAPACITY), + tileVisibilities(nullptr) { + if (initialBitmap && initialBitmap->size() < tileCount * BITMAP_SIZE_PER_TILE_VISIBILITY) + throw std::invalid_argument("Initial bitmap size is too small for the given record number."); -template -RGVisibility::RGVisibility(uint64_t rgRecordNum, uint64_t timestamp, const std::vector& initialBitmap) - : tileCount((rgRecordNum + VISIBILITY_RECORD_CAPACITY - 1) / VISIBILITY_RECORD_CAPACITY) { - size_t allocSize = tileCount * sizeof(TileVisibility); - void* rawMemory = operator new[](allocSize); + tileVisibilities = static_cast*>( + operator new[](tileCount * 
sizeof(TileVisibility))); - if (initialBitmap.size() < tileCount * BITMAP_SIZE_PER_TILE_VISIBILITY) { - operator delete[](rawMemory); - throw std::runtime_error("Initial bitmap size is too small for the given record number."); - } - - tileVisibilities = static_cast*>(rawMemory); - for (uint64_t i = 0; i < tileCount; ++i) { - // Each tile takes 4 uint64_t - const uint64_t* tileBitmap = &initialBitmap[i * BITMAP_SIZE_PER_TILE_VISIBILITY]; - // We use timestamp 0 for restored checkpoints to serve as the base state - new (&tileVisibilities[i]) TileVisibility(timestamp, tileBitmap); - } + for (uint64_t i = 0; i < tileCount; ++i) + new (&tileVisibilities[i]) TileVisibility( + timestamp, + initialBitmap ? initialBitmap->data() + i * BITMAP_SIZE_PER_TILE_VISIBILITY : nullptr); } template @@ -62,11 +50,14 @@ RGVisibility::~RGVisibility() { } template -void RGVisibility::collectRGGarbage(uint64_t timestamp) { -// TileVisibility::collectTileGarbage uses COW + Epoch, so it's safe to call concurrently - for (uint64_t i = 0; i < tileCount; i++) { - tileVisibilities[i].collectTileGarbage(timestamp); +std::vector RGVisibility::collectRGGarbage(uint64_t timestamp) { + size_t totalWords = tileCount * BITMAP_SIZE_PER_TILE_VISIBILITY; + std::vector rgSnapshot(totalWords, 0); + for (uint32_t t = 0; t < tileCount; t++) { + tileVisibilities[t].collectTileGarbage(timestamp, + rgSnapshot.data() + t * BITMAP_SIZE_PER_TILE_VISIBILITY); } + return rgSnapshot; } template @@ -104,5 +95,32 @@ uint64_t RGVisibility::getBitmapSize() const { return tileCount * BITMAP_SIZE_PER_TILE_VISIBILITY; } +template +std::vector RGVisibility::exportChainItemsAfter(uint64_t safeGcTs) const { + std::vector> items; + for (uint32_t t = 0; t < tileCount; t++) + tileVisibilities[t].exportChainItemsAfter(t, safeGcTs, items); + std::vector result; + result.reserve(items.size() * 2); + for (auto& [off, ts] : items) { result.push_back(off); result.push_back(ts); } + return result; +} + +template +void 
RGVisibility::importDeletionChain(const uint64_t* items, size_t pairCount) { + std::vector> tileBuckets(tileCount); + for (size_t i = 0; i < pairCount; i++) { + uint32_t rgRowOffset = static_cast(items[2 * i]); + uint64_t ts = items[2 * i + 1]; + uint32_t tileId = rgRowOffset / CAPACITY; + uint16_t localRowId = static_cast(rgRowOffset % CAPACITY); + tileBuckets[tileId].push_back(makeDeleteIndex(localRowId, ts)); + } + for (uint32_t t = 0; t < tileCount; t++) { + if (tileBuckets[t].empty()) continue; + tileVisibilities[t].importDeletionItems(tileBuckets[t]); + } +} + // Explicit Instantiations for JNI use template class RGVisibility; diff --git a/cpp/pixels-retina/lib/RGVisibilityJni.cpp b/cpp/pixels-retina/lib/RGVisibilityJni.cpp index ff655f8a79..fdcbeaa328 100644 --- a/cpp/pixels-retina/lib/RGVisibilityJni.cpp +++ b/cpp/pixels-retina/lib/RGVisibilityJni.cpp @@ -26,40 +26,28 @@ /* * Class: io_pixelsdb_pixels_retina_RGVisibility * Method: createNativeObject - * Signature: (J)J - */ -JNIEXPORT jlong JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_createNativeObject - (JNIEnv* env, jobject, jlong rgRecordNum) { - try { - auto* rgVisibility = new RGVisibilityInstance(rgRecordNum); - return reinterpret_cast(rgVisibility); - } catch (const std::exception& e) { - env->ThrowNew(env->FindClass("java/lang/RuntimeException"), e.what()); - return 0; - } -} - -/* - * Class: io_pixelsdb_pixels_retina_RGVisibility - * Method: createNativeObjectInitialized * Signature: (JJ[J)J + * + * Converts the Java bitmap array to a native vector when present, then + * forwards to a single RGVisibility constructor call. 
 */ -JNIEXPORT jlong JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_createNativeObjectInitialized +JNIEXPORT jlong JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_createNativeObject (JNIEnv* env, jobject, jlong rgRecordNum, jlong timestamp, jlongArray bitmap) { try { - jsize len = env->GetArrayLength(bitmap); - jlong *body = env->GetLongArrayElements(bitmap, nullptr); - std::vector bitmapData; - bitmapData.reserve(len); - for (int i = 0; i < len; i++) { - bitmapData.push_back((uint64_t)body[i]); + const std::vector* bitmapPtr = nullptr; + std::vector<uint64_t> bitmapData; + if (bitmap != nullptr) { + jsize len = env->GetArrayLength(bitmap); + jlong* body = env->GetLongArrayElements(bitmap, nullptr); + bitmapData.assign(reinterpret_cast(body), + reinterpret_cast(body) + len); + env->ReleaseLongArrayElements(bitmap, body, JNI_ABORT); + bitmapPtr = &bitmapData; } - - env->ReleaseLongArrayElements(bitmap, body, JNI_ABORT); - - RGVisibilityInstance *rgVisibility = new RGVisibilityInstance(rgRecordNum, timestamp, bitmapData); - return reinterpret_cast(rgVisibility); + return reinterpret_cast(new RGVisibilityInstance( + static_cast(rgRecordNum), + static_cast(timestamp), + bitmapPtr)); } catch (const std::exception& e) { env->ThrowNew(env->FindClass("java/lang/RuntimeException"), e.what()); return 0; } } @@ -129,13 +117,55 @@ JNIEXPORT jlongArray JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_getVisi /* * Class: io_pixelsdb_pixels_retina_RGVisibility * Method: garbageCollect - * Signature: (JJ)V + * Signature: (JJ)[J */ -JNIEXPORT void JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_garbageCollect +JNIEXPORT jlongArray JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_garbageCollect (JNIEnv* env, jobject, jlong timestamp, jlong handle) { try { auto* rgVisibility = reinterpret_cast(handle); - rgVisibility->collectRGGarbage(timestamp); + std::vector snapshot = rgVisibility->collectRGGarbage(timestamp); + jlongArray result = env->NewLongArray(snapshot.size()); + env->SetLongArrayRegion(result, 
0, snapshot.size(), + reinterpret_cast(snapshot.data())); + return result; + } catch (const std::exception& e) { + env->ThrowNew(env->FindClass("java/lang/RuntimeException"), e.what()); + return nullptr; + } +} + +/* + * Class: io_pixelsdb_pixels_retina_RGVisibility + * Method: exportChainItemsAfter + * Signature: (JJ)[J + */ +JNIEXPORT jlongArray JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_exportChainItemsAfter + (JNIEnv* env, jobject, jlong safeGcTs, jlong handle) { + try { + auto* rgVis = reinterpret_cast(handle); + std::vector items = rgVis->exportChainItemsAfter(static_cast(safeGcTs)); + jlongArray result = env->NewLongArray(items.size()); + env->SetLongArrayRegion(result, 0, items.size(), reinterpret_cast(items.data())); + return result; + } catch (const std::exception& e) { + env->ThrowNew(env->FindClass("java/lang/RuntimeException"), e.what()); + return nullptr; + } +} + +/* + * Class: io_pixelsdb_pixels_retina_RGVisibility + * Method: importDeletionChain + * Signature: ([JJ)V + */ +JNIEXPORT void JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_importDeletionChain + (JNIEnv* env, jobject, jlongArray items, jlong handle) { + try { + auto* rgVis = reinterpret_cast(handle); + jsize len = env->GetArrayLength(items); + jlong* body = env->GetLongArrayElements(items, nullptr); + rgVis->importDeletionChain(reinterpret_cast(body), len / 2); + env->ReleaseLongArrayElements(items, body, JNI_ABORT); } catch (const std::exception& e) { env->ThrowNew(env->FindClass("java/lang/RuntimeException"), e.what()); } @@ -190,6 +220,5 @@ JNIEXPORT jlong JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_getRetinaTra */ JNIEXPORT jlong JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_getRetinaObjectCount (JNIEnv *env, jclass clazz) { - // Read the atomic object counter from RetinaBase namespace return static_cast(pixels::g_retina_object_count.load(std::memory_order_relaxed)); } diff --git a/cpp/pixels-retina/lib/TileVisibility.cpp 
b/cpp/pixels-retina/lib/TileVisibility.cpp index 1123032e20..f4fcdcb429 100644 --- a/cpp/pixels-retina/lib/TileVisibility.cpp +++ b/cpp/pixels-retina/lib/TileVisibility.cpp @@ -21,32 +21,18 @@ #include "TileVisibility.h" #include "EpochManager.h" +#include +#include +#include #include #include #include -#include "TileVisibility.h" -#include "EpochManager.h" -#include -#include -#include - template -TileVisibility::TileVisibility() { - VersionedData* initialVersion = new VersionedData(); - currentVersion.store(initialVersion, std::memory_order_release); - tail.store(nullptr, std::memory_order_release); - tailUsed.store(0, std::memory_order_release); -} - -template -TileVisibility::TileVisibility(uint64_t ts, const uint64_t* bitmap) { - VersionedData* initialVersion = new VersionedData(ts, bitmap, nullptr); - currentVersion.store(initialVersion, std::memory_order_release); - tail.store(nullptr, std::memory_order_release); - tailUsed.store(0, std::memory_order_release); -} +TileVisibility::TileVisibility(uint64_t timestamp, const uint64_t* bitmap) + : currentVersion(new VersionedData(timestamp, bitmap)), + tail(nullptr), tailUsed(0) {} template TileVisibility::~TileVisibility() { @@ -61,6 +47,12 @@ TileVisibility::~TileVisibility() { delete ver; } + // Clean up any version left in the pending retirement slot + VersionedData* pending = pendingRetire.load(std::memory_order_acquire); + if (pending) { + delete pending; + } + // Clean up retired versions and their delete chains for (auto& retired : this->retired) { if (retired.data) { @@ -97,8 +89,9 @@ void TileVisibility::deleteTileRecord(uint16_t rowId, uint64_t ts) { VersionedData* newVer = new VersionedData(oldVer->baseTimestamp, oldVer->baseBitmap, newBlk); if (currentVersion.compare_exchange_strong(oldVer, newVer, std::memory_order_acq_rel)) { - // Success: retire old version (no chain to delete since head was nullptr) - delete oldVer; + // Defer retirement: a concurrent reader may still hold oldVer under EpochGuard. 
+ // collectTileGarbage will drain this slot and epoch-retire it properly. + pendingRetire.store(oldVer, std::memory_order_release); tailUsed.store(1, std::memory_order_release); return; } else { @@ -199,14 +192,33 @@ void TileVisibility::getTileVisibilityBitmap(uint64_t ts, uint64_t* ou size_t currentTailUsed = tailUsed.load(std::memory_order_relaxed); size_t count = (blk == currentTail) ? currentTailUsed : DeleteIndexBlock::BLOCK_CAPACITY; + // Same tail/tailUsed race as in collectTileGarbage: count may be 0 or + // a stale BLOCK_CAPACITY for a newly-created tail block. count == 0 + // means no items to read; skip cleanly. The stale-count case (items + // beyond the first being zero-initialised) is handled in the scalar + // path below via the item == 0 sentinel check. + if (count == 0) { + blk = blk->next.load(std::memory_order_relaxed); + continue; + } + uint64_t i = 0; #ifdef RETINA_SIMD + // NOTE: the SIMD path does not check for zero-initialised (item == 0) + // sentinel values. In the extremely rare stale-tailUsed race window, + // up to BLOCK_CAPACITY-1 zero items may cause row 0 to be transiently + // marked as deleted in the output bitmap. This is a known limitation + // of the SIMD fast path; the effect is transient (not persisted) and + // self-correcting on the next query once tailUsed is fully updated. for (; i + 4 <= count; i += 4) { process_bitmap_block_256(blk, i, outBitmap, vThrFlip, tsMask, signBit); } #endif for (; i < count; i++) { uint64_t item = blk->items[i]; + // Sentinel: zero item signals an uninitialised slot (see + // collectTileGarbage for the full race description). 
+ if (item == 0) return; if (extractTimestamp(item) <= ts) { SET_BITMAP_BIT(outBitmap, extractRowId(item)); } else { @@ -218,20 +230,44 @@ void TileVisibility::getTileVisibilityBitmap(uint64_t ts, uint64_t* ou } template -void TileVisibility::collectTileGarbage(uint64_t ts) { +void TileVisibility::collectTileGarbage(uint64_t ts, uint64_t* gcSnapshotBitmap) { + // Drain the pending retirement slot left by deleteTileRecord's empty-chain path. + VersionedData* pending = pendingRetire.exchange(nullptr, std::memory_order_acquire); + if (pending) { + uint64_t retireEpoch = EpochManager::getInstance().advanceEpoch(); + retired.emplace_back(pending, nullptr, retireEpoch); + } + // Load old version VersionedData* oldVer = currentVersion.load(std::memory_order_acquire); - if (ts <= oldVer->baseTimestamp) return; - // Find the last block that should be compacted + // Early return A: safeGcTs <= baseTimestamp, nothing to compact + if (ts <= oldVer->baseTimestamp) { + std::memcpy(gcSnapshotBitmap, oldVer->baseBitmap, NUM_WORDS * sizeof(uint64_t)); + return; + } + + // Find the last block that should be compacted. + // Snapshot tail/tailUsed once and reuse in both the scan loop and the + // compact loop to guarantee a consistent view of the chain endpoint. DeleteIndexBlock *blk = oldVer->head; DeleteIndexBlock *lastFullBlk = nullptr; uint64_t newBaseTimestamp = oldVer->baseTimestamp; + auto* tailSnap1 = tail.load(std::memory_order_acquire); + size_t tailUsedSnap1 = tailUsed.load(std::memory_order_acquire); while (blk) { - size_t count = (blk == tail.load(std::memory_order_acquire)) - ? tailUsed.load(std::memory_order_acquire) + size_t count = (blk == tailSnap1) + ? tailUsedSnap1 : DeleteIndexBlock::BLOCK_CAPACITY; + // Guard: deleteTileRecord updates `tail` and `tailUsed` non-atomically. 
+ // In the narrow window after `tail` is advanced to a new block but before + // `tailUsed.store(1)` completes, we may observe count == 0 (empty-list + // path: tailUsed transitions 0 → 1) or a stale BLOCK_CAPACITY (full-block + // path: tailUsed transitions BLOCK_CAPACITY → 1 via store, not CAS). + // When count == 0 there is nothing to compact; stop here and let the next + // GC cycle handle the block once it is fully initialised. + if (count == 0) break; uint64_t lastItemTs = extractTimestamp(blk->items[count - 1]); if (lastItemTs <= ts) { lastFullBlk = blk; @@ -240,7 +276,22 @@ void TileVisibility::collectTileGarbage(uint64_t ts) { blk = blk->next.load(std::memory_order_acquire); } - if (!lastFullBlk) return; + // Early return B: no compactable block + if (!lastFullBlk) { + std::memcpy(gcSnapshotBitmap, oldVer->baseBitmap, NUM_WORDS * sizeof(uint64_t)); + if (oldVer->head) { + auto* tailSnap = tail.load(std::memory_order_acquire); + size_t tailUsedSnap = tailUsed.load(std::memory_order_acquire); + size_t cnt = (oldVer->head == tailSnap) ? tailUsedSnap : DeleteIndexBlock::BLOCK_CAPACITY; + for (size_t i = 0; i < cnt; i++) { + uint64_t item = oldVer->head->items[i]; + if (item == 0) break; + if (extractTimestamp(item) <= ts) SET_BITMAP_BIT(gcSnapshotBitmap, extractRowId(item)); + else break; + } + } + return; + } // Create new version with Copy-on-Write // Manually compute the new base bitmap from oldVer @@ -250,9 +301,17 @@ void TileVisibility::collectTileGarbage(uint64_t ts) { // Apply deletes from oldVer->head up to lastFullBlk blk = oldVer->head; while (blk) { - size_t count = (blk == lastFullBlk && blk == tail.load()) ? tailUsed.load() : DeleteIndexBlock::BLOCK_CAPACITY; + size_t count = (blk == lastFullBlk && blk == tailSnap1) ? 
tailUsedSnap1 : DeleteIndexBlock::BLOCK_CAPACITY; for (size_t i = 0; i < count; i++) { uint64_t item = blk->items[i]; + // Guard: a zero item means an uninitialised slot in a newly-created + // tail block observed under the same tail/tailUsed race described + // above (full-block path: tailUsed is still BLOCK_CAPACITY while + // only items[0] is valid; items[1..n] remain zero-initialised). + // item == 0 encodes makeDeleteIndex(rowId=0, ts=0); since all valid + // transaction timestamps are > 0, this value is never a legitimate + // deletion record and safely identifies the end of valid items. + if (item == 0) break; if (extractTimestamp(item) <= ts) { SET_BITMAP_BIT(newBaseBitmap, extractRowId(item)); } @@ -261,8 +320,22 @@ void TileVisibility::collectTileGarbage(uint64_t ts) { blk = blk->next.load(std::memory_order_acquire); } - // Get new head and break the chain to avoid double-free + // Compact path: build gcSnapshotBitmap by scanning the boundary block. + // Reuse the same tail/tailUsed snapshot (tailSnap1/tailUsedSnap1) taken at + // the start of this GC cycle to ensure consistent chain-end semantics. DeleteIndexBlock* newHead = lastFullBlk->next.load(std::memory_order_acquire); + std::memcpy(gcSnapshotBitmap, newBaseBitmap, NUM_WORDS * sizeof(uint64_t)); + if (newHead) { + size_t cnt = (newHead == tailSnap1) ? 
tailUsedSnap1 : DeleteIndexBlock::BLOCK_CAPACITY; + for (size_t i = 0; i < cnt; i++) { + uint64_t item = newHead->items[i]; + if (item == 0) break; + if (extractTimestamp(item) <= ts) SET_BITMAP_BIT(gcSnapshotBitmap, extractRowId(item)); + else break; + } + } + + // Break the chain to avoid double-free lastFullBlk->next.store(nullptr, std::memory_order_release); // Create new version with new head - this is the atomic COW update @@ -314,5 +387,108 @@ void TileVisibility::reclaimRetiredVersions() { } } +template +void TileVisibility::exportChainItemsAfter( + uint32_t tileId, uint64_t safeGcTs, + std::vector>& gcChainItems) const { + auto* ver = currentVersion.load(std::memory_order_acquire); + auto* tailSnap = tail.load(std::memory_order_acquire); + size_t tailUsedSnap = tailUsed.load(std::memory_order_acquire); + + auto* blk = ver->head; + bool pastBoundary = false; + while (blk != nullptr) { + size_t count = (blk == tailSnap) ? tailUsedSnap : DeleteIndexBlock::BLOCK_CAPACITY; + for (size_t i = 0; i < count; i++) { + uint64_t item = blk->items[i]; + if (item == 0) return; + if (pastBoundary) { + uint32_t rgOffset = tileId * CAPACITY + extractRowId(item); + gcChainItems.push_back({rgOffset, extractTimestamp(item)}); + } else { + uint64_t ts = extractTimestamp(item); + if (ts > safeGcTs) { + pastBoundary = true; + uint32_t rgOffset = tileId * CAPACITY + extractRowId(item); + gcChainItems.push_back({rgOffset, ts}); + } + } + } + if (blk == tailSnap) return; + blk = blk->next.load(std::memory_order_acquire); + } +} + +template +void TileVisibility::importDeletionItems(std::vector& bucket) { + std::sort(bucket.begin(), bucket.end(), [](uint64_t a, uint64_t b) { + return extractTimestamp(a) < extractTimestamp(b); + }); + + bool tailClaimed = false; + while (true) { + auto* ver = currentVersion.load(std::memory_order_acquire); + + uint64_t ts_head = UINT64_MAX; + if (ver->head != nullptr) { + uint64_t firstItem = ver->head->items[0]; + if (firstItem != 0) ts_head = 
extractTimestamp(firstItem); + } + + size_t keepCount = bucket.size(); + if (ts_head != UINT64_MAX) { + keepCount = std::upper_bound(bucket.begin(), bucket.end(), ts_head, + [](uint64_t val, uint64_t item) { + return val < extractTimestamp(item); + }) - bucket.begin(); + } + if (keepCount == 0) return; + + uint64_t lastValidItem = bucket[keepCount - 1]; + std::vector blocks; + for (size_t i = 0; i < keepCount; i += DeleteIndexBlock::BLOCK_CAPACITY) { + auto* blk = new DeleteIndexBlock(); + for (size_t j = 0; j < DeleteIndexBlock::BLOCK_CAPACITY; j++) { + size_t idx = i + j; + blk->items[j] = (idx < keepCount) ? bucket[idx] : lastValidItem; + } + blocks.push_back(blk); + } + for (size_t i = 0; i + 1 < blocks.size(); i++) + blocks[i]->next.store(blocks[i + 1], std::memory_order_release); + blocks.back()->next.store(ver->head, std::memory_order_release); + + if (ver->head == nullptr && !tailClaimed) { + size_t lastBlockItems = keepCount % DeleteIndexBlock::BLOCK_CAPACITY; + if (lastBlockItems == 0) lastBlockItems = DeleteIndexBlock::BLOCK_CAPACITY; + + DeleteIndexBlock* expectedTail = nullptr; + if (tail.compare_exchange_strong(expectedTail, blocks.back(), + std::memory_order_release, std::memory_order_relaxed)) { + tailUsed.store(lastBlockItems, std::memory_order_release); + tailClaimed = true; + } else { + for (auto* blk : blocks) delete blk; + continue; + } + } + + auto* newVer = new VersionedData(ver->baseTimestamp, ver->baseBitmap, blocks[0]); + + if (currentVersion.compare_exchange_strong(ver, newVer, std::memory_order_acq_rel)) { + uint64_t retireEpoch = EpochManager::getInstance().advanceEpoch(); + retired.emplace_back(ver, nullptr, retireEpoch); + reclaimRetiredVersions(); + return; + } + if (tailClaimed) { + std::fprintf(stderr, "importDeletionItems: CAS failed with tailClaimed — invariant violation\n"); + std::abort(); + } + delete newVer; + for (auto* blk : blocks) delete blk; + } +} + // Explicit Instantiations (Add the sizes you need here) -template 
class TileVisibility; +template class TileVisibility; \ No newline at end of file diff --git a/cpp/pixels-retina/test/RGVisibilityTest.cpp b/cpp/pixels-retina/test/RGVisibilityTest.cpp index b1e6daf633..8d8b135eee 100644 --- a/cpp/pixels-retina/test/RGVisibilityTest.cpp +++ b/cpp/pixels-retina/test/RGVisibilityTest.cpp @@ -235,4 +235,439 @@ TEST_F(RGVisibilityTest, MultiThread) { delete[] finalBitmap; delete[] expectedFinalBitmap; +} + +// ===================================================================== +// gcSnapshotBitmap correctness tests +// +// Core verification: the bitmap returned by collectRGGarbage (gcSnapshotBitmap) +// must be bitwise identical to getRGVisibilityBitmap called BEFORE GC. +// +// Why pre-GC reference matters: +// getRGVisibilityBitmap traverses the full, unmodified deletion chain — it is +// a completely independent computation from the GC code path. Comparing +// gcSnapshotBitmap with a post-GC getRGVisibilityBitmap is weaker because +// both read from state that GC just modified; a bug that corrupts the compact +// AND the snapshot identically would go undetected. +// +// Each test also verifies that post-GC queries still return correct results +// (regression check on the compact logic itself). 
+// +// Covers all three code paths in collectTileGarbage: +// A — ts <= baseTimestamp (early return, no compaction) +// B — chain exists but no full block compactable +// C — one or more blocks compacted (with/without boundary block) +// ===================================================================== + +static void compareBitmaps( + const uint64_t* actual, const uint64_t* expected, uint64_t size, uint64_t ts, + const char* actualLabel, const char* expectedLabel) +{ + for (size_t i = 0; i < size; i++) { + EXPECT_EQ(actual[i], expected[i]) + << "Word " << i << " (rows " << (i * 64) << "-" << (i * 64 + 63) + << ") at ts=" << ts + << "\n " << actualLabel << ": " << std::bitset<64>(actual[i]) + << "\n " << expectedLabel << ": " << std::bitset<64>(expected[i]); + } +} + +static void verifyGcSnapshot( + RGVisibilityInstance* rgv, uint64_t ts, + const uint64_t* preGcRef, const std::vector& snapshot) +{ + uint64_t bitmapSize = rgv->getBitmapSize(); + ASSERT_EQ(snapshot.size(), bitmapSize); + + // Primary check: gcSnapshotBitmap must match the pre-GC ground truth + compareBitmaps(snapshot.data(), preGcRef, bitmapSize, ts, + "gcSnapshot", "preGcRef"); + + // Secondary check: post-GC query must also agree (compact regression) + uint64_t* postGcRef = rgv->getRGVisibilityBitmap(ts); + compareBitmaps(snapshot.data(), postGcRef, bitmapSize, ts, + "gcSnapshot", "postGcQuery"); + delete[] postGcRef; +} + +// Path A: empty chain → all-zero snapshot; then repeat GC at same ts → early return A +TEST_F(RGVisibilityTest, GcSnapshot_EarlyReturnA) { + // Empty chain: baseTimestamp=0, ts=100 > 0 → enters path B with null head + uint64_t* preRef0 = rgVisibility->getRGVisibilityBitmap(100); + std::vector snap0 = rgVisibility->collectRGGarbage(100); + verifyGcSnapshot(rgVisibility, 100, preRef0, snap0); + delete[] preRef0; + for (auto w : snap0) { + EXPECT_EQ(w, 0ULL); + } + + // Add deletes and compact to advance baseTimestamp + rgVisibility->deleteRGRecord(5, 100); + 
rgVisibility->deleteRGRecord(10, 100); + rgVisibility->deleteRGRecord(15, 200); + + // First GC at ts=200 → compact all 3 items → baseTimestamp becomes 200 + uint64_t* preRef1 = rgVisibility->getRGVisibilityBitmap(200); + std::vector snap1 = rgVisibility->collectRGGarbage(200); + verifyGcSnapshot(rgVisibility, 200, preRef1, snap1); + delete[] preRef1; + + // Second GC at ts=200 → ts == baseTimestamp → true early return A + uint64_t* preRef2 = rgVisibility->getRGVisibilityBitmap(200); + std::vector snap2 = rgVisibility->collectRGGarbage(200); + verifyGcSnapshot(rgVisibility, 200, preRef2, snap2); + delete[] preRef2; + + ASSERT_EQ(snap1.size(), snap2.size()); + for (size_t i = 0; i < snap1.size(); i++) { + EXPECT_EQ(snap1[i], snap2[i]); + } +} + +// Path B: chain exists, head block straddles safeGcTs → no compactable block +TEST_F(RGVisibilityTest, GcSnapshot_EarlyReturnB) { + // 5 items in one block: ts 1,2,3,8,10. Block last ts=10 > safeGcTs=5 + rgVisibility->deleteRGRecord(0, 1); + rgVisibility->deleteRGRecord(1, 2); + rgVisibility->deleteRGRecord(2, 3); + rgVisibility->deleteRGRecord(3, 8); + rgVisibility->deleteRGRecord(4, 10); + + uint64_t* preRef = rgVisibility->getRGVisibilityBitmap(5); + std::vector snapshot = rgVisibility->collectRGGarbage(5); + verifyGcSnapshot(rgVisibility, 5, preRef, snapshot); + delete[] preRef; + + // Rows 0,1,2 marked (ts ≤ 5); rows 3,4 not (ts 8,10 > 5) + EXPECT_EQ(snapshot[0], 0b111ULL); +} + +// Path B variant: all items in head block have ts > safeGcTs +TEST_F(RGVisibilityTest, GcSnapshot_EarlyReturnB_NoneMatch) { + rgVisibility->deleteRGRecord(0, 10); + rgVisibility->deleteRGRecord(1, 20); + + uint64_t* preRef = rgVisibility->getRGVisibilityBitmap(5); + std::vector snapshot = rgVisibility->collectRGGarbage(5); + verifyGcSnapshot(rgVisibility, 5, preRef, snapshot); + delete[] preRef; + + EXPECT_EQ(snapshot[0], 0ULL); +} + +// Path C: one full block compacted + boundary block with mixed items +TEST_F(RGVisibilityTest, 
GcSnapshot_CompactWithBoundary) { + // 10 items: rows 0-9, ts 1-10 + // Block 1 (8 items, ts 1-8): last ts=8 ≤ 9 → compactable + // Block 2 (2 items, ts 9-10): boundary block + for (uint32_t i = 0; i < 10; i++) { + rgVisibility->deleteRGRecord(i, i + 1); + } + + uint64_t* preRef = rgVisibility->getRGVisibilityBitmap(9); + std::vector snapshot = rgVisibility->collectRGGarbage(9); + verifyGcSnapshot(rgVisibility, 9, preRef, snapshot); + delete[] preRef; + + // Rows 0-8 marked (ts 1-9 ≤ 9), row 9 not (ts 10 > 9) + EXPECT_EQ(snapshot[0], 0x1FFULL); // bits 0-8 +} + +// Path C: all blocks fully compacted, no remaining chain +TEST_F(RGVisibilityTest, GcSnapshot_CompactAllBlocks) { + // Exactly 8 items fill one block: rows 0-7, ts 1-8 + for (uint32_t i = 0; i < 8; i++) { + rgVisibility->deleteRGRecord(i, i + 1); + } + + // safeGcTs=10 > all item ts → entire block compacted, newHead=null + uint64_t* preRef = rgVisibility->getRGVisibilityBitmap(10); + std::vector snapshot = rgVisibility->collectRGGarbage(10); + verifyGcSnapshot(rgVisibility, 10, preRef, snapshot); + delete[] preRef; + + EXPECT_EQ(snapshot[0], 0xFFULL); // bits 0-7 +} + +// Path C: multiple blocks compacted before a boundary block +TEST_F(RGVisibilityTest, GcSnapshot_CompactMultiBlock) { + // 20 items: rows 0-19, ts 1-20 + // Block 1 (ts 1-8), Block 2 (ts 9-16), Block 3 tail (ts 17-20) + // safeGcTs=18: blocks 1,2 compacted, block 3 is boundary + for (uint32_t i = 0; i < 20; i++) { + rgVisibility->deleteRGRecord(i, i + 1); + } + + uint64_t* preRef = rgVisibility->getRGVisibilityBitmap(18); + std::vector snapshot = rgVisibility->collectRGGarbage(18); + verifyGcSnapshot(rgVisibility, 18, preRef, snapshot); + delete[] preRef; + + // Rows 0-17 marked (ts 1-18 ≤ 18), rows 18-19 not + EXPECT_EQ(snapshot[0], (1ULL << 18) - 1); +} + +// Multiple deletes sharing the same timestamp (batch deletes) +TEST_F(RGVisibilityTest, GcSnapshot_SameTimestamp) { + rgVisibility->deleteRGRecord(0, 5); + 
rgVisibility->deleteRGRecord(1, 5); + rgVisibility->deleteRGRecord(2, 5); + rgVisibility->deleteRGRecord(3, 10); + rgVisibility->deleteRGRecord(4, 10); + + uint64_t* preRef = rgVisibility->getRGVisibilityBitmap(5); + std::vector snapshot = rgVisibility->collectRGGarbage(5); + verifyGcSnapshot(rgVisibility, 5, preRef, snapshot); + delete[] preRef; + + EXPECT_EQ(snapshot[0], 0b111ULL); +} + +// Deletes spanning multiple tiles (RETINA_CAPACITY=256 rows per tile) +TEST_F(RGVisibilityTest, GcSnapshot_CrossTile) { + // Tile 0: rows 0-255 Tile 1: rows 256-511 + // Tile 2: rows 512-767 + rgVisibility->deleteRGRecord(5, 1); // tile 0 + rgVisibility->deleteRGRecord(10, 2); // tile 0 + rgVisibility->deleteRGRecord(260, 3); // tile 1, localRow 4 + rgVisibility->deleteRGRecord(600, 4); // tile 2, localRow 88 + rgVisibility->deleteRGRecord(100, 5); // tile 0 + rgVisibility->deleteRGRecord(300, 6); // tile 1, localRow 44 + + uint64_t* preRef1 = rgVisibility->getRGVisibilityBitmap(4); + std::vector snapshot = rgVisibility->collectRGGarbage(4); + verifyGcSnapshot(rgVisibility, 4, preRef1, snapshot); + delete[] preRef1; + + // After GC at ts=4, also verify a higher ts sees more deletes + uint64_t* preRef2 = rgVisibility->getRGVisibilityBitmap(6); + std::vector snap2 = rgVisibility->collectRGGarbage(6); + verifyGcSnapshot(rgVisibility, 6, preRef2, snap2); + delete[] preRef2; +} + +// Progressive GC rounds with interleaved inserts +TEST_F(RGVisibilityTest, GcSnapshot_ProgressiveRounds) { + // Phase 1: 20 deletes at ts 1-20 + for (uint32_t i = 0; i < 20; i++) { + rgVisibility->deleteRGRecord(i, i + 1); + } + + uint64_t* preRef1 = rgVisibility->getRGVisibilityBitmap(5); + std::vector snap1 = rgVisibility->collectRGGarbage(5); + verifyGcSnapshot(rgVisibility, 5, preRef1, snap1); + delete[] preRef1; + + uint64_t* preRef2 = rgVisibility->getRGVisibilityBitmap(12); + std::vector snap2 = rgVisibility->collectRGGarbage(12); + verifyGcSnapshot(rgVisibility, 12, preRef2, snap2); + delete[] 
preRef2; + + // Phase 2: 10 more deletes at ts 21-30 + for (uint32_t i = 20; i < 30; i++) { + rgVisibility->deleteRGRecord(i, i + 1); + } + + uint64_t* preRef3 = rgVisibility->getRGVisibilityBitmap(25); + std::vector snap3 = rgVisibility->collectRGGarbage(25); + verifyGcSnapshot(rgVisibility, 25, preRef3, snap3); + delete[] preRef3; + + // Final GC beyond all timestamps + uint64_t* preRef4 = rgVisibility->getRGVisibilityBitmap(100); + std::vector snap4 = rgVisibility->collectRGGarbage(100); + verifyGcSnapshot(rgVisibility, 100, preRef4, snap4); + delete[] preRef4; + + // All 30 rows should be marked + EXPECT_EQ(snap4[0], (1ULL << 30) - 1); +} + +// Randomized: random deletes across all tiles, verify at each GC round +TEST_F(RGVisibilityTest, GcSnapshot_Randomized) { + std::mt19937 gen(42); + std::uniform_int_distribution rowDist(0, ROW_COUNT - 1); + std::vector deleted(ROW_COUNT, false); + uint64_t ts = 1; + uint64_t lastGcTs = 0; + + for (int round = 0; round < 10; round++) { + for (int d = 0; d < 100; d++) { + uint32_t rowId; + do { rowId = rowDist(gen); } while (deleted[rowId]); + deleted[rowId] = true; + rgVisibility->deleteRGRecord(rowId, ts); + ts++; + } + + uint64_t gcTs = lastGcTs + 51; + if (gcTs >= ts) gcTs = ts - 1; + + uint64_t* preRef = rgVisibility->getRGVisibilityBitmap(gcTs); + std::vector snapshot = rgVisibility->collectRGGarbage(gcTs); + verifyGcSnapshot(rgVisibility, gcTs, preRef, snapshot); + delete[] preRef; + lastGcTs = gcTs; + } + + // Final GC beyond all timestamps + uint64_t* preRefFinal = rgVisibility->getRGVisibilityBitmap(ts + 100); + std::vector finalSnap = rgVisibility->collectRGGarbage(ts + 100); + verifyGcSnapshot(rgVisibility, ts + 100, preRefFinal, finalSnap); + delete[] preRefFinal; +} + +// ===================================================================== +// exportChainItemsAfter tests +// ===================================================================== + +TEST_F(RGVisibilityTest, ExportChainItemsAfter_Basic) { + 
rgVisibility->deleteRGRecord(5, 50); + rgVisibility->deleteRGRecord(10, 100); + rgVisibility->deleteRGRecord(300, 150); + rgVisibility->deleteRGRecord(500, 200); + + std::vector items = rgVisibility->exportChainItemsAfter(100); + + ASSERT_EQ(items.size(), 4u); + EXPECT_EQ(items[0], 300u); + EXPECT_EQ(items[1], 150u); + EXPECT_EQ(items[2], 500u); + EXPECT_EQ(items[3], 200u); +} + +TEST_F(RGVisibilityTest, ExportChainItemsAfter_Empty) { + std::vector items = rgVisibility->exportChainItemsAfter(100); + EXPECT_EQ(items.size(), 0u); +} + +// ===================================================================== +// importDeletionChain tests +// ===================================================================== + +TEST_F(RGVisibilityTest, ImportDeletionChain_Basic) { + uint64_t items[] = {5, 100, 10, 200, 300, 300}; + rgVisibility->importDeletionChain(items, 3); + + uint64_t* bitmap = rgVisibility->getRGVisibilityBitmap(400); + EXPECT_NE(bitmap[5 / 64] & (1ULL << (5 % 64)), 0u); + EXPECT_NE(bitmap[10 / 64] & (1ULL << (10 % 64)), 0u); + EXPECT_NE(bitmap[300 / 64] & (1ULL << (300 % 64)), 0u); + delete[] bitmap; +} + +TEST_F(RGVisibilityTest, ImportDeletionChain_CrossTile) { + uint64_t items[] = { + 0, 100, + RETINA_CAPACITY - 1, 200, + RETINA_CAPACITY, 300, + RETINA_CAPACITY * 2 + 5, 400 + }; + rgVisibility->importDeletionChain(items, 4); + + uint64_t* bitmap = rgVisibility->getRGVisibilityBitmap(500); + EXPECT_NE(bitmap[0 / 64] & (1ULL << (0 % 64)), 0u); + uint32_t r1 = RETINA_CAPACITY - 1; + EXPECT_NE(bitmap[r1 / 64] & (1ULL << (r1 % 64)), 0u); + uint32_t r2 = RETINA_CAPACITY; + EXPECT_NE(bitmap[r2 / 64] & (1ULL << (r2 % 64)), 0u); + uint32_t r3 = RETINA_CAPACITY * 2 + 5; + EXPECT_NE(bitmap[r3 / 64] & (1ULL << (r3 % 64)), 0u); + delete[] bitmap; +} + +// ===================================================================== +// Export → Import end-to-end with coordinate mapping +// ===================================================================== + 
+TEST_F(RGVisibilityTest, ExportImportEndToEnd) { + uint64_t safeGcTs = 100; + + rgVisibility->deleteRGRecord(5, 50); + rgVisibility->deleteRGRecord(10, 80); + rgVisibility->deleteRGRecord(15, 150); + rgVisibility->deleteRGRecord(20, 200); + rgVisibility->deleteRGRecord(300, 250); + + std::vector exported = rgVisibility->exportChainItemsAfter(safeGcTs); + ASSERT_EQ(exported.size(), 6u); + + RGVisibilityInstance newRgVis(ROW_COUNT, safeGcTs, nullptr); + + newRgVis.importDeletionChain(exported.data(), exported.size() / 2); + + for (uint64_t snapTs : {150ULL, 200ULL, 250ULL, 500ULL}) { + uint64_t* oldBitmap = rgVisibility->getRGVisibilityBitmap(snapTs); + uint64_t* newBitmap = newRgVis.getRGVisibilityBitmap(snapTs); + + for (uint32_t row : {15u, 20u, 300u}) { + bool oldSet = (oldBitmap[row / 64] & (1ULL << (row % 64))) != 0; + bool newSet = (newBitmap[row / 64] & (1ULL << (row % 64))) != 0; + EXPECT_EQ(oldSet, newSet) + << "Mismatch at row=" << row << " snapTs=" << snapTs; + } + + for (uint32_t row : {5u, 10u}) { + uint64_t* newCheck = newRgVis.getRGVisibilityBitmap(snapTs); + bool newSet = (newCheck[row / 64] & (1ULL << (row % 64))) != 0; + EXPECT_FALSE(newSet) + << "Row " << row << " (ts<=safeGcTs) should NOT be in new chain at snapTs=" << snapTs; + delete[] newCheck; + } + + delete[] oldBitmap; + delete[] newBitmap; + } +} + +// ===================================================================== +// Concurrent dual-write + importDeletionChain +// ===================================================================== + +TEST_F(RGVisibilityTest, ImportDeletionChain_ConcurrentDualWrite) { + constexpr int IMPORT_ITEMS = 50; + constexpr int DUAL_WRITE_ITEMS = 200; + uint64_t safeGcTs = 100; + + RGVisibilityInstance newRgVis(ROW_COUNT, safeGcTs, nullptr); + + std::vector importItems; + importItems.reserve(IMPORT_ITEMS * 2); + for (int i = 0; i < IMPORT_ITEMS; i++) { + uint32_t row = 1000 + i; + uint64_t ts = safeGcTs + 1 + i; + importItems.push_back(row); + 
importItems.push_back(ts); + } + + std::atomic importDone{false}; + + std::thread dualWriteThread([&]() { + for (int i = 0; i < DUAL_WRITE_ITEMS; i++) { + uint32_t row = 2000 + i; + uint64_t ts = safeGcTs + 500 + i; + newRgVis.deleteRGRecord(row, ts); + } + }); + + newRgVis.importDeletionChain(importItems.data(), IMPORT_ITEMS); + importDone.store(true, std::memory_order_release); + + dualWriteThread.join(); + + uint64_t queryTs = safeGcTs + 500 + DUAL_WRITE_ITEMS + 100; + uint64_t* bitmap = newRgVis.getRGVisibilityBitmap(queryTs); + + for (int i = 0; i < IMPORT_ITEMS; i++) { + uint32_t row = 1000 + i; + EXPECT_NE(bitmap[row / 64] & (1ULL << (row % 64)), 0u) + << "Imported row " << row << " missing from bitmap"; + } + for (int i = 0; i < DUAL_WRITE_ITEMS; i++) { + uint32_t row = 2000 + i; + EXPECT_NE(bitmap[row / 64] & (1ULL << (row % 64)), 0u) + << "Dual-write row " << row << " missing from bitmap"; + } + + delete[] bitmap; } \ No newline at end of file diff --git a/cpp/pixels-retina/test/TileVisibilityTest.cpp b/cpp/pixels-retina/test/TileVisibilityTest.cpp index 83fe099bb9..0a84b806f9 100644 --- a/cpp/pixels-retina/test/TileVisibilityTest.cpp +++ b/cpp/pixels-retina/test/TileVisibilityTest.cpp @@ -68,6 +68,11 @@ class TileVisibilityTest : public ::testing::Test { return true; } + void collectGarbage(uint64_t ts) { + uint64_t buf[BITMAP_SIZE] = {0}; + v->collectTileGarbage(ts, buf); + } + TileVisibility* v; }; @@ -89,7 +94,7 @@ TEST_F(TileVisibilityTest, BaseFunction) { SET_BITMAP_BIT(expectedBitmap, 2); EXPECT_TRUE(checkBitmap(actualBitmap, expectedBitmap)); - v->collectTileGarbage(101); + collectGarbage(101); v->getTileVisibilityBitmap(101, actualBitmap); EXPECT_TRUE(checkBitmap(actualBitmap, expectedBitmap)); } @@ -111,7 +116,7 @@ TEST_F(TileVisibilityTest, GarbageCollect) { for (int i = 0; i < count; i++) { v->deleteTileRecord(i, i + 100); } - v->collectTileGarbage(150); + collectGarbage(150); uint64_t actualBitmap[BITMAP_SIZE] = {0}; uint64_t 
expectedBitmap[BITMAP_SIZE] = {0}; @@ -123,7 +128,7 @@ TEST_F(TileVisibilityTest, GarbageCollect) { for (int i = 51; i < count; i++) { SET_BITMAP_BIT(expectedBitmap, i); } - v->collectTileGarbage(100 + count); + collectGarbage(100 + count); v->getTileVisibilityBitmap(100 + count, actualBitmap); EXPECT_TRUE(checkBitmap(actualBitmap, expectedBitmap)); } @@ -268,3 +273,425 @@ TEST_F(TileVisibilityTest, MultiThread) { EXPECT_TRUE(checkBitmap(finalBitmap, expectedFinalBitmap)); } + +/** + * ZeroSentinelInGarbageCollect — deterministic regression for Scenario 2 guard. + * + * The fix for the full-block race (Scenario 2) relies on treating item=0 as a + * sentinel that marks uninitialised tail slots. makeDeleteIndex(rowId=0, ts=0)=0, + * which is identical to the zero-initialised memory of a freshly-allocated block. + * + * Precondition enforced by TransService: all valid transaction timestamps are > 0, + * so ts=0 can never represent a real deletion and is safe to use as a sentinel. + * + * This test simulates the exact item value produced by the race without requiring + * concurrent execution: + * 1. Fill BLOCK_CAPACITY-1 slots with valid (rowId, ts) pairs. + * 2. Insert makeDeleteIndex(0,0)=0 into the last slot — the same value a + * zero-initialised slot in a new block would have during the race window. + * 3. Run GC: without the fix, extractTimestamp(0)=0 ≤ ts would SET_BITMAP_BIT(0). + * with the fix, `if (item == 0) break` stops before touching bit 0. + * + * Failure mode WITHOUT fix: bits 0..BLOCK_CAPACITY-2 set (bit 0 is spurious). + * Pass condition WITH fix: bits 1..BLOCK_CAPACITY-2 set, bit 0 NOT set. + */ +TEST_F(TileVisibilityTest, ZeroSentinelInGarbageCollect) { + // Fill slots 0..BLOCK_CAPACITY-2 with valid items (rows 1..7, ts 1..7) + for (uint16_t i = 1; i < DeleteIndexBlock::BLOCK_CAPACITY; i++) { + v->deleteTileRecord(i, static_cast(i)); + } + // Insert the sentinel value (row=0, ts=0 → item=0) into the final slot. 
+ // This replicates the zero-initialised items[1..7] that GC would encounter + // during the Scenario-2 race if tailUsed were stale at BLOCK_CAPACITY. + v->deleteTileRecord(0, 0); + + collectGarbage(100); + + uint64_t actualBitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(100, actualBitmap); + + // Rows 1..(BLOCK_CAPACITY-1) should be deleted; row 0 must NOT be set. + uint64_t expectedBitmap[BITMAP_SIZE] = {0}; + for (uint16_t i = 1; i < DeleteIndexBlock::BLOCK_CAPACITY; i++) { + SET_BITMAP_BIT(expectedBitmap, i); + } + EXPECT_TRUE(checkBitmap(actualBitmap, expectedBitmap)) + << "Row 0 must not be set: item==0 sentinel guard must stop GC " + "before processing zero-initialised (or ts=0) slots"; +} + +/** + * ConcurrentGCAndFirstInsert — targets Scenario 1 (empty-list path race). + * + * Race condition: + * deleteTileRecord (empty list) does: + * 1. tail.CAS(nullptr → newBlk) + * 2. currentVersion.CAS(oldVer → newVer with head=newBlk) ← head now visible + * 3. tailUsed.store(1) ← window: 2 done, 3 not yet + * + * If collectTileGarbage runs between steps 2 and 3: + * - blk = newVer->head = newBlk (reachable) + * - count = tailUsed = 0 (not yet updated) + * - BEFORE FIX: items[count-1] = items[size_t(-1)] → size_t underflow → UB / crash + * - AFTER FIX: count==0 guard breaks out safely; no crash. + * + * NOTE on test reliability: the race window is between two adjacent atomic operations + * (currentVersion.CAS at line ~99 and tailUsed.store at line ~102 in deleteTileRecord). + * This is too narrow to trigger reliably with OS-level scheduling alone; the test is + * therefore a probabilistic stress test rather than a deterministic reproducer. For + * guaranteed detection, compile with AddressSanitizer + ThreadSanitizer or add a + * -DENABLE_TEST_HOOKS build flag that injects a sleep between the two operations. 
+ * + * The primary value of this test is as a no-crash regression guard: if the count==0 + * guard is removed, a crash (size_t underflow → OOB array access) will eventually + * surface under sustained concurrent load even if it is not triggered every run. + */ +TEST_F(TileVisibilityTest, ConcurrentGCAndFirstInsert) { + constexpr int TRIALS = 200; + + for (int trial = 0; trial < TRIALS; trial++) { + delete v; + v = new TileVisibility(); + + std::atomic deleteStarted{false}; + std::atomic gcDone{false}; + + // GC thread: spin-waits until the delete thread has signalled it started, + // then immediately fires GC to maximise the chance of hitting the race window. + auto gcThread = std::thread([&]() { + while (!deleteStarted.load(std::memory_order_acquire)) {} + collectGarbage(1000); + gcDone.store(true, std::memory_order_release); + }); + + // Delete thread: signals start, then inserts the very first item (row=5, ts=100). + // Row 0 is intentionally never deleted so we can use bit 0 as a spurious-set + // canary in the companion scenario-2 test. + deleteStarted.store(true, std::memory_order_release); + v->deleteTileRecord(5, 100); + + gcThread.join(); + + // After both operations complete, GC with a ts that covers the inserted item + // and verify the bitmap is exactly {row 5 deleted}. + collectGarbage(1000); + uint64_t actualBitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(1000, actualBitmap); + + uint64_t expectedBitmap[BITMAP_SIZE] = {0}; + SET_BITMAP_BIT(expectedBitmap, 5); + + EXPECT_TRUE(checkBitmap(actualBitmap, expectedBitmap)) + << "Trial " << trial << ": bitmap incorrect after concurrent first-insert + GC"; + } +} + +/** + * ConcurrentGCAndBlockTransition — targets Scenario 2 (full-block path race). + * + * Race condition: + * deleteTileRecord (old tail block is full) does: + * 1. curTail->next.CAS(nullptr → newBlk) + * 2. tail.CAS(curTail → newBlk) ← tail now points to new block + * 3. 
tailUsed.store(1) ← window: 2 done, 3 not yet + * + * If collectTileGarbage runs between steps 2 and 3: + * - blk == tail (newBlk), count = tailUsed = BLOCK_CAPACITY (stale old value, 8) + * - items[0] is the real insertion; items[1..BLOCK_CAPACITY-1] are zero-initialised + * - BEFORE FIX: extractTimestamp(0)=0 ≤ ts → SET_BITMAP_BIT(extractRowId(0)=0) + * → bit 0 spuriously set in baseBitmap (persistent data corruption) + * - AFTER FIX: item==0 guard breaks the inner loop; no spurious bit 0. + * + * Strategy: pre-fill exactly BLOCK_CAPACITY items (one full block) with ts values + * that GC will compact, then concurrently fire GC and the (BLOCK_CAPACITY+1)-th + * insert that triggers new-block creation. Row 0 is never deleted; if the bug fires, + * getTileVisibilityBitmap will report bit 0 set even though row 0 was never deleted. + * + * NOTE on test reliability: identical narrow-window caveat as ConcurrentGCAndFirstInsert. + * The ZeroSentinelInGarbageCollect test above is the deterministic companion that + * verifies the item==0 guard logic directly without requiring concurrent execution. + */ +TEST_F(TileVisibilityTest, ConcurrentGCAndBlockTransition) { + constexpr uint64_t GC_TS = 1000; + // Number of concurrent trials; more iterations → higher probability of hitting the race. + constexpr int TRIALS = 500; + + std::atomic spuriousRow0{false}; + + for (int trial = 0; trial < TRIALS && !spuriousRow0.load(); trial++) { + delete v; + v = new TileVisibility(); + + // Pre-fill exactly BLOCK_CAPACITY (8) items so the next insert triggers + // the full-block → new-block code path. Use rows 1..8 (never row 0). + for (size_t i = 0; i < DeleteIndexBlock::BLOCK_CAPACITY; i++) { + v->deleteTileRecord(static_cast(i + 1), i + 1); + } + + std::atomic insertReady{false}; + + // GC thread: waits for the insert thread to be about to create the new block, + // then fires GC immediately to race with tail/tailUsed update. 
+ auto gcThread = std::thread([&]() { + while (!insertReady.load(std::memory_order_acquire)) {} + collectGarbage(GC_TS); + }); + + // Insert thread: signal then insert the (BLOCK_CAPACITY+1)-th item to force + // new-block creation. Row 0 is the canary — never intentionally deleted. + insertReady.store(true, std::memory_order_release); + v->deleteTileRecord(10, DeleteIndexBlock::BLOCK_CAPACITY + 1); + + gcThread.join(); + + // Run one more clean GC to ensure everything that should be compacted is. + collectGarbage(GC_TS); + + // Check the canary: bit 0 must be 0 because row 0 was never deleted. + uint64_t bitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(GC_TS, bitmap); + + if (GET_BITMAP_BIT(bitmap, 0)) { + spuriousRow0.store(true); + ADD_FAILURE() << "Trial " << trial + << ": bit 0 spuriously set in bitmap — " + << "stale tailUsed race bug triggered (Scenario 2)"; + } + } + + EXPECT_FALSE(spuriousRow0.load()) + << "Row 0 was spuriously marked deleted by GC processing " + "zero-initialised slots of a newly created tail block."; +} + +// ========================================================================= +// exportChainItemsAfter tests +// ========================================================================= + +TEST_F(TileVisibilityTest, ExportChainItemsAfter_Basic) { + v->deleteTileRecord(1, 50); + v->deleteTileRecord(2, 100); + v->deleteTileRecord(3, 150); + v->deleteTileRecord(4, 200); + v->deleteTileRecord(5, 250); + + std::vector> items; + v->exportChainItemsAfter(0, 100, items); + + ASSERT_EQ(items.size(), 3u); + EXPECT_EQ(items[0].second, 150u); + EXPECT_EQ(items[1].second, 200u); + EXPECT_EQ(items[2].second, 250u); + EXPECT_EQ(items[0].first, 0u * RETINA_CAPACITY + 3u); + EXPECT_EQ(items[1].first, 0u * RETINA_CAPACITY + 4u); + EXPECT_EQ(items[2].first, 0u * RETINA_CAPACITY + 5u); +} + +TEST_F(TileVisibilityTest, ExportChainItemsAfter_AllAbove) { + v->deleteTileRecord(1, 200); + v->deleteTileRecord(2, 300); + + std::vector> items; + 
v->exportChainItemsAfter(0, 100, items); + + ASSERT_EQ(items.size(), 2u); + EXPECT_EQ(items[0].second, 200u); + EXPECT_EQ(items[1].second, 300u); +} + +TEST_F(TileVisibilityTest, ExportChainItemsAfter_AllBelow) { + v->deleteTileRecord(1, 50); + v->deleteTileRecord(2, 80); + v->deleteTileRecord(3, 100); + + std::vector> items; + v->exportChainItemsAfter(0, 100, items); + EXPECT_EQ(items.size(), 0u); +} + +TEST_F(TileVisibilityTest, ExportChainItemsAfter_EmptyChain) { + std::vector> items; + v->exportChainItemsAfter(0, 100, items); + EXPECT_EQ(items.size(), 0u); +} + +TEST_F(TileVisibilityTest, ExportChainItemsAfter_AfterGC) { + v->deleteTileRecord(1, 50); + v->deleteTileRecord(2, 100); + v->deleteTileRecord(3, 150); + v->deleteTileRecord(4, 200); + + collectGarbage(100); + + std::vector> items; + v->exportChainItemsAfter(0, 100, items); + + ASSERT_EQ(items.size(), 2u); + EXPECT_EQ(items[0].second, 150u); + EXPECT_EQ(items[1].second, 200u); +} + +TEST_F(TileVisibilityTest, ExportChainItemsAfter_MultiBlock) { + for (uint16_t i = 0; i < 20; i++) { + v->deleteTileRecord(i + 10, (i + 1) * 10); + } + + std::vector> items; + v->exportChainItemsAfter(0, 100, items); + + ASSERT_EQ(items.size(), 10u); + for (size_t i = 0; i < 10; i++) { + EXPECT_EQ(items[i].second, (i + 11) * 10); + EXPECT_EQ(items[i].first, 0u * RETINA_CAPACITY + (i + 20)); + } +} + +// ========================================================================= +// importDeletionItems tests +// ========================================================================= + +TEST_F(TileVisibilityTest, ImportDeletionItems_Basic) { + std::vector bucket; + bucket.push_back(makeDeleteIndex(1, 100)); + bucket.push_back(makeDeleteIndex(2, 200)); + bucket.push_back(makeDeleteIndex(5, 300)); + bucket.push_back(makeDeleteIndex(10, 400)); + + v->importDeletionItems(bucket); + + uint64_t actualBitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(500, actualBitmap); + + uint64_t expectedBitmap[BITMAP_SIZE] = {0}; + 
SET_BITMAP_BIT(expectedBitmap, 1); + SET_BITMAP_BIT(expectedBitmap, 2); + SET_BITMAP_BIT(expectedBitmap, 5); + SET_BITMAP_BIT(expectedBitmap, 10); + EXPECT_TRUE(checkBitmap(actualBitmap, expectedBitmap)); +} + +TEST_F(TileVisibilityTest, ImportDeletionItems_EmptyBucket) { + v->deleteTileRecord(1, 100); + + std::vector empty; + v->importDeletionItems(empty); + + uint64_t actualBitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(200, actualBitmap); + uint64_t expectedBitmap[BITMAP_SIZE] = {0}; + SET_BITMAP_BIT(expectedBitmap, 1); + EXPECT_TRUE(checkBitmap(actualBitmap, expectedBitmap)); +} + +TEST_F(TileVisibilityTest, ImportDeletionItems_MultiBlock) { + std::vector bucket; + uint64_t expectedBitmap[BITMAP_SIZE] = {0}; + for (uint16_t i = 0; i < 20; i++) { + bucket.push_back(makeDeleteIndex(i + 10, (i + 1) * 10)); + SET_BITMAP_BIT(expectedBitmap, i + 10); + } + + v->importDeletionItems(bucket); + + uint64_t actualBitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(300, actualBitmap); + EXPECT_TRUE(checkBitmap(actualBitmap, expectedBitmap)); +} + +TEST_F(TileVisibilityTest, ImportDeletionItems_Padding) { + std::vector bucket; + for (uint16_t i = 0; i < 5; i++) { + bucket.push_back(makeDeleteIndex(i + 1, (i + 1) * 100)); + } + + v->importDeletionItems(bucket); + + uint64_t expectedBitmap[BITMAP_SIZE] = {0}; + for (uint16_t i = 0; i < 5; i++) SET_BITMAP_BIT(expectedBitmap, i + 1); + + uint64_t actualBitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(600, actualBitmap); + EXPECT_TRUE(checkBitmap(actualBitmap, expectedBitmap)); + + uint64_t partialBitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(250, partialBitmap); + uint64_t partialExpected[BITMAP_SIZE] = {0}; + SET_BITMAP_BIT(partialExpected, 1); + SET_BITMAP_BIT(partialExpected, 2); + EXPECT_TRUE(checkBitmap(partialBitmap, partialExpected)); +} + +// ========================================================================= +// importDeletionItems — truncation dedup +// 
========================================================================= + +TEST_F(TileVisibilityTest, ImportDeletionItems_TruncationDedup) { + v->deleteTileRecord(20, 300); + v->deleteTileRecord(21, 400); + + std::vector bucket; + bucket.push_back(makeDeleteIndex(1, 100)); + bucket.push_back(makeDeleteIndex(2, 200)); + bucket.push_back(makeDeleteIndex(3, 300)); + bucket.push_back(makeDeleteIndex(4, 400)); + bucket.push_back(makeDeleteIndex(5, 500)); + + v->importDeletionItems(bucket); + + uint64_t expectedBitmap[BITMAP_SIZE] = {0}; + SET_BITMAP_BIT(expectedBitmap, 1); + SET_BITMAP_BIT(expectedBitmap, 2); + SET_BITMAP_BIT(expectedBitmap, 3); + SET_BITMAP_BIT(expectedBitmap, 20); + SET_BITMAP_BIT(expectedBitmap, 21); + SET_BITMAP_BIT(expectedBitmap, 4); + SET_BITMAP_BIT(expectedBitmap, 5); + + uint64_t actualBitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(600, actualBitmap); + EXPECT_TRUE(checkBitmap(actualBitmap, expectedBitmap)); +} + +TEST_F(TileVisibilityTest, ImportDeletionItems_FullOverlap) { + v->deleteTileRecord(20, 100); + v->deleteTileRecord(21, 200); + + std::vector bucket; + bucket.push_back(makeDeleteIndex(1, 200)); + bucket.push_back(makeDeleteIndex(2, 300)); + + v->importDeletionItems(bucket); + + uint64_t expectedBitmap[BITMAP_SIZE] = {0}; + SET_BITMAP_BIT(expectedBitmap, 20); + SET_BITMAP_BIT(expectedBitmap, 21); + + uint64_t actualBitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(500, actualBitmap); + EXPECT_TRUE(checkBitmap(actualBitmap, expectedBitmap)); +} + +// ========================================================================= +// importDeletionItems — empty chain tail claim + subsequent deletes +// ========================================================================= + +TEST_F(TileVisibilityTest, ImportDeletionItems_EmptyChainTailClaim) { + std::vector bucket; + bucket.push_back(makeDeleteIndex(1, 100)); + bucket.push_back(makeDeleteIndex(2, 200)); + + v->importDeletionItems(bucket); + + v->deleteTileRecord(5, 300); + 
v->deleteTileRecord(6, 400); + + uint64_t expectedBitmap[BITMAP_SIZE] = {0}; + SET_BITMAP_BIT(expectedBitmap, 1); + SET_BITMAP_BIT(expectedBitmap, 2); + SET_BITMAP_BIT(expectedBitmap, 5); + SET_BITMAP_BIT(expectedBitmap, 6); + + uint64_t actualBitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(500, actualBitmap); + EXPECT_TRUE(checkBitmap(actualBitmap, expectedBitmap)); +} diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/error/ErrorCode.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/error/ErrorCode.java index 6de78732da..2bcd676994 100644 --- a/pixels-common/src/main/java/io/pixelsdb/pixels/common/error/ErrorCode.java +++ b/pixels-common/src/main/java/io/pixelsdb/pixels/common/error/ErrorCode.java @@ -83,6 +83,8 @@ public class ErrorCode public static final int METADATA_UPDATE_SINGLE_POINT_INDEX_FAILED = (ERROR_BASE_METADATA + 51); public static final int METADATA_DELETE_SINGLE_POINT_INDEX_FAILED = (ERROR_BASE_METADATA + 52); public static final int METADATA_ADD_RETINA_BUFFER_FAILED = (ERROR_BASE_METADATA + 53); + public static final int METADATA_GET_FILE_BY_ID_FAILED = (ERROR_BASE_METADATA + 54); + public static final int METADATA_ATOMIC_SWAP_FILES_FAILED = (ERROR_BASE_METADATA + 55); // end error code for metadata rpc // begin error code for shared memory message queue diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/metadata/MetadataService.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/metadata/MetadataService.java index 74df929c31..8835f63ac7 100644 --- a/pixels-common/src/main/java/io/pixelsdb/pixels/common/metadata/MetadataService.java +++ b/pixels-common/src/main/java/io/pixelsdb/pixels/common/metadata/MetadataService.java @@ -1505,6 +1505,68 @@ public boolean deleteFiles(List fileIds) throws MetadataException return false; } + /** + * Get a file by its id. 
+ * @param fileId the file id + * @return the file, or null if not found + * @throws MetadataException if the request fails + */ + public File getFileById(long fileId) throws MetadataException + { + String token = UUID.randomUUID().toString(); + MetadataProto.GetFileByIdRequest request = MetadataProto.GetFileByIdRequest.newBuilder() + .setHeader(MetadataProto.RequestHeader.newBuilder().setToken(token)) + .setFileId(fileId).build(); + try + { + MetadataProto.GetFileByIdResponse response = this.stub.getFileById(request); + if (response.getHeader().getErrorCode() != 0) + { + return null; + } + if (!response.getHeader().getToken().equals(token)) + { + throw new MetadataException("response token does not match."); + } + return new File(response.getFile()); + } + catch (Exception e) + { + throw new MetadataException("failed to get file by id", e); + } + } + + /** + * Atomically promote a TEMPORARY file to REGULAR and delete the old files. + * @param newFileId the id of the new TEMPORARY file to promote + * @param oldFileIds the ids of old files to delete + * @throws MetadataException if the request fails + */ + public void atomicSwapFiles(long newFileId, List oldFileIds) throws MetadataException + { + String token = UUID.randomUUID().toString(); + MetadataProto.AtomicSwapFilesRequest request = MetadataProto.AtomicSwapFilesRequest.newBuilder() + .setHeader(MetadataProto.RequestHeader.newBuilder().setToken(token)) + .setNewFileId(newFileId).addAllOldFileIds(oldFileIds).build(); + try + { + MetadataProto.AtomicSwapFilesResponse response = this.stub.atomicSwapFiles(request); + if (response.getHeader().getErrorCode() != 0) + { + throw new MetadataException("error code=" + response.getHeader().getErrorCode() + + ", error message=" + response.getHeader().getErrorMsg()); + } + if (!response.getHeader().getToken().equals(token)) + { + throw new MetadataException("response token does not match."); + } + } + catch (Exception e) + { + throw new MetadataException("failed to atomic 
swap files", e); + } + } + public boolean createPeerPath(String uri, List columns, Path path, Peer peer) throws MetadataException { String token = UUID.randomUUID().toString(); diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/CheckpointFileIO.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/CheckpointFileIO.java index bd78ab64e1..e83235573a 100644 --- a/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/CheckpointFileIO.java +++ b/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/CheckpointFileIO.java @@ -31,10 +31,12 @@ import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; +import java.util.Objects; import java.util.concurrent.BlockingQueue; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutorService; import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.TimeUnit; /** * Unified checkpoint file read/write utility class. @@ -73,7 +75,8 @@ public CheckpointEntry(long fileId, int rgId, int recordNum, long[] bitmap) this.fileId = fileId; this.rgId = rgId; this.recordNum = recordNum; - this.bitmap = bitmap; + this.bitmap = Objects.requireNonNull(bitmap, + "bitmap must not be null for fileId=" + fileId + ", rgId=" + rgId); } } @@ -99,7 +102,7 @@ private CheckpointFileIO() * @param path the file path * @param totalRgs total number of entries to write * @param queue blocking queue containing CheckpointEntry objects - * @throws Exception if writing fails + * @throws Exception if writing fails or a producer does not deliver an entry within the timeout */ public static void writeCheckpoint(String path, int totalRgs, BlockingQueue queue) throws Exception { @@ -109,7 +112,12 @@ public static void writeCheckpoint(String path, int totalRgs, BlockingQueueFile types * * - * - * + * * * * @@ -80,7 +80,6 @@ public final class PixelsFileNameUtils */ public enum PxlFileType { - RETINA("retina"), ORDERED("ordered"), COMPACT("compact"), SINGLE("single"), 
@@ -131,11 +130,11 @@ public static PxlFileType fromLabel(String label) *
  • timestamp — exactly 14 digits (yyyyMMddHHmmss)
  • *
  • atomicCount
  • *
  • virtualNodeId — non-negative integer, or {@code -1} for single files
  • - *
  • type label — one of {@code retina|ordered|compact|single|copy}
  • + *
  • type label — one of {@code ordered|compact|single|copy}
  • * */ private static final Pattern PXL_PATTERN = Pattern.compile( - "(?:.*/)?(.+)_(\\d{14})_(\\d+)_(-?\\d+)_(retina|ordered|compact|single|copy)\\.pxl$"); + "(?:.*/)?(.+)_(\\d{14})_(\\d+)_(-?\\d+)_(ordered|compact|single|copy)\\.pxl$"); private PixelsFileNameUtils() {} @@ -161,16 +160,7 @@ public static String buildPxlFileName(String hostName, int virtualNodeId, PxlFil // ------------------------------------------------------------------------- /** - * Builds a Retina file name (CDC real-time write path). - *

    Format: {@code ____retina.pxl} - */ - public static String buildRetinaFileName(String hostName, int virtualNodeId) - { - return buildPxlFileName(hostName, virtualNodeId, PxlFileType.RETINA); - } - - /** - * Builds an Ordered file name (indexed batch load). + * Builds an Ordered file name (CDC real-time write path and indexed batch load). *

    Format: {@code ____ordered.pxl} */ public static String buildOrderedFileName(String hostName, int virtualNodeId) @@ -281,8 +271,7 @@ public static PxlFileType extractFileType(String path) /** * Returns {@code true} if the file at {@code path} is eligible for Storage GC, - * i.e. its type is one of {@link PxlFileType#RETINA}, {@link PxlFileType#ORDERED}, - * or {@link PxlFileType#COMPACT}. + * i.e. its type is one of {@link PxlFileType#ORDERED} or {@link PxlFileType#COMPACT}. * *

    {@link PxlFileType#SINGLE} and {@link PxlFileType#COPY} files, as well as * unrecognised paths, return {@code false}. @@ -290,7 +279,7 @@ public static PxlFileType extractFileType(String path) public static boolean isGcEligible(String path) { PxlFileType type = extractFileType(path); - return type == PxlFileType.RETINA || type == PxlFileType.ORDERED || type == PxlFileType.COMPACT; + return type == PxlFileType.ORDERED || type == PxlFileType.COMPACT; } // ------------------------------------------------------------------------- diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/RetinaUtils.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/RetinaUtils.java index 3545aa59ce..dc17eac21b 100644 --- a/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/RetinaUtils.java +++ b/pixels-common/src/main/java/io/pixelsdb/pixels/common/utils/RetinaUtils.java @@ -130,4 +130,54 @@ public static String getCheckpointPrefix(String typePrefix, String hostname) { return typePrefix + hostname + "_"; } + + /** + * Builds the checkpoint file path from a directory, prefix, hostname and timestamp. + * + * @param checkpointDir directory where checkpoint files reside (may or may not end with '/') + * @param prefix {@link #CHECKPOINT_PREFIX_GC} or {@link #CHECKPOINT_PREFIX_OFFLOAD} + * @param hostname the retina host name + * @param timestamp the GC or offload timestamp + */ + public static String buildCheckpointPath(String checkpointDir, String prefix, String hostname, long timestamp) + { + String fileName = getCheckpointFileName(prefix, hostname, timestamp); + return checkpointDir.endsWith("/") ? checkpointDir + fileName : checkpointDir + "/" + fileName; + } + + // ── writeBufferKey utilities ──────────────────────────────────── + + /** + * Builds the canonical key for {@code pixelsWriteBufferMap} from schema and table name. + */ + public static String buildWriteBufferKey(String schemaName, String tableName) + { + return schemaName + "." 
+ tableName; + } + + // ── rgKey utilities ────────────────────────────────────────────── + + /** + * Builds the canonical {@code rgVisibilityMap} key for a row group. + */ + public static String buildRgKey(long fileId, int rgId) + { + return fileId + "_" + rgId; + } + + /** + * Extracts the file ID from an rgKey ({@code "_"}). + */ + public static long parseFileIdFromRgKey(String rgKey) + { + return Long.parseLong(rgKey.substring(0, rgKey.indexOf('_'))); + } + + /** + * Extracts the row group ID from an rgKey ({@code "_"}). + */ + public static int parseRgIdFromRgKey(String rgKey) + { + return Integer.parseInt(rgKey.substring(rgKey.indexOf('_') + 1)); + } } diff --git a/pixels-common/src/main/resources/pixels.properties b/pixels-common/src/main/resources/pixels.properties index cc9bd3122c..cdfad1b847 100644 --- a/pixels-common/src/main/resources/pixels.properties +++ b/pixels-common/src/main/resources/pixels.properties @@ -265,7 +265,7 @@ cpuspl = [10000,60000,300000,600000] # split mem (G) memspl = [1,8,16,32,64] -### pixels-retina - write buffer flush configuration ### +### pixels-retina ### # set to true to enable pixels-retina retina.enable=false @@ -304,7 +304,21 @@ pixels.transaction.offload.threshold=1800 # lease duration for retina offload cache in seconds, default 600s retina.offload.cache.lease.duration=600 # snapshot storage directory -pixels.retina.checkpoint.dir=file:///tmp/pixels-checkpoints +retina.checkpoint.dir=file:///tmp/pixels-checkpoints +# set to true to enable storage GC (rewrites high-deletion-ratio files to reclaim space) +retina.storage.gc.enabled=false +# invalidRatio must be strictly greater than this value for a file to be a GC candidate +retina.storage.gc.threshold=0.5 +# target size in bytes for rewritten files produced by Storage GC, default 128MB +retina.storage.gc.target.file.size=134217728 +# maximum number of old files per FileGroup (controls per-rewrite workload) +retina.storage.gc.max.files.per.group=20 +# maximum number of 
(tableId, virtualNodeId) file groups processed per GC cycle +retina.storage.gc.max.file.groups.per.run=10 +# encoding level for rewritten files produced by Storage GC (0=EL0, 1=EL1, 2=EL2) +retina.storage.gc.encoding.level=2 +# hours to keep old files after atomic swap before physical deletion (wall-clock delay) +retina.storage.gc.file.retire.delay.hours=24 ### pixels-sink ### sink.server.enabled=false diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/TypeDescription.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/TypeDescription.java index 2774e7f79e..6bf3e1fe3b 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/TypeDescription.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/TypeDescription.java @@ -1820,6 +1820,55 @@ public static boolean match(int mode1, int mode2) } } + /** + * Serializes one cell from a {@link ColumnVector} at the given row index into + * the canonical byte format, using the same encoding as {@link #convertSqlStringToByte}. 
+ */ + public byte[] convertColumnVectorToByte(ColumnVector col, int row) + { + switch (getCategory()) + { + case BOOLEAN: + case BYTE: + return new byte[]{(byte) ((LongColumnVector) col).vector[row]}; + case SHORT: + case INT: + case DATE: + case TIME: + return ByteBuffer.allocate(Integer.BYTES).putInt((int) ((LongColumnVector) col).vector[row]).array(); + case LONG: + case TIMESTAMP: + return ByteBuffer.allocate(Long.BYTES).putLong(((LongColumnVector) col).vector[row]).array(); + case FLOAT: + return ByteBuffer.allocate(Integer.BYTES).putInt(((FloatColumnVector) col).vector[row]).array(); + case DOUBLE: + return ByteBuffer.allocate(Long.BYTES).putLong(((DoubleColumnVector) col).vector[row]).array(); + case DECIMAL: + if (getPrecision() <= MAX_SHORT_DECIMAL_PRECISION) + { + return ByteBuffer.allocate(Long.BYTES).putLong(((DecimalColumnVector) col).vector[row]).array(); + } + else + { + LongDecimalColumnVector ldcv = (LongDecimalColumnVector) col; + long high = ldcv.vector[row * 2]; + long low = ldcv.vector[row * 2 + 1]; + return ByteBuffer.allocate(16).putLong(high).putLong(low).array(); + } + case CHAR: + case VARCHAR: + case STRING: + case BINARY: + case VARBINARY: + { + BinaryColumnVector bcv = (BinaryColumnVector) col; + return Arrays.copyOfRange(bcv.vector[row], bcv.start[row], bcv.start[row] + bcv.lens[row]); + } + default: + throw new UnsupportedOperationException("Unsupported column type: " + getCategory()); + } + } + public byte[] convertSqlStringToByte(String value) { if (value == null || value.isEmpty()) diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsReaderOption.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsReaderOption.java index 2317ee6e83..ad176b2f48 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsReaderOption.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsReaderOption.java @@ -34,6 +34,7 @@ public class PixelsReaderOption private boolean 
tolerantSchemaEvolution = true; // this may lead to column missing due to schema evolution private boolean enableEncodedColumnVector = false; // whether read encoded column vectors directly when possible private boolean readIntColumnAsIntVector = false; // whether read int32 columns as int32 column vectors + private boolean exposeHiddenColumn = false; // whether expose the hidden commit timestamp column in the result batch private long transId = -1L; private long transTimestamp = -1L; // -1 means no need to consider the timestamp when reading data private int rgStart = 0; @@ -153,4 +154,15 @@ public boolean isReadIntColumnAsIntVector() { return readIntColumnAsIntVector; } + + public PixelsReaderOption exposeHiddenColumn(boolean exposeHiddenColumn) + { + this.exposeHiddenColumn = exposeHiddenColumn; + return this; + } + + public boolean isExposeHiddenColumn() + { + return exposeHiddenColumn; + } } diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderImpl.java index 534c40c204..bd1ed826e6 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/reader/PixelsRecordReaderImpl.java @@ -66,7 +66,8 @@ public class PixelsRecordReaderImpl implements PixelsRecordReader private final PixelsReaderOption option; private final long transId; private final long transTimestamp; - private final boolean shouldReadHiddenColumn; + private final boolean filterByHiddenTimestamp; + private final boolean needReadHiddenColumn; private final int RGStart; private int RGLen; private final boolean enableMetrics; @@ -161,7 +162,13 @@ public PixelsRecordReaderImpl(PhysicalReader physicalReader, this.option = option; this.transId = option.getTransId(); this.transTimestamp = option.getTransTimestamp(); - this.shouldReadHiddenColumn = option.hasValidTransTimestamp() && 
postScript.getHasHiddenColumn(); + if (option.isExposeHiddenColumn() && !postScript.getHasHiddenColumn()) + { + throw new IOException("PixelsReaderOption.exposeHiddenColumn is true, " + + "but the file has no hidden column: " + physicalReader.getPath()); + } + this.filterByHiddenTimestamp = option.hasValidTransTimestamp() && postScript.getHasHiddenColumn(); + this.needReadHiddenColumn = this.filterByHiddenTimestamp || option.isExposeHiddenColumn(); this.RGStart = option.getRGStart(); this.RGLen = option.getRGLen(); this.enableEncodedVector = option.isEnableEncodedColumnVector(); @@ -223,7 +230,7 @@ private void checkBeforeRead() throws IOException includedColumnNum = 0; String[] optionIncludedCols = option.getIncludedCols(); // if size of cols is 0, create an empty row batch - if (!shouldReadHiddenColumn && optionIncludedCols.length == 0) + if (!needReadHiddenColumn && optionIncludedCols.length == 0) { checkValid = true; // Issue #103: init the following members as null. @@ -280,13 +287,13 @@ private void checkBeforeRead() throws IOException // create column readers List columnSchemas = fileSchema.getChildren(); - readers = new ColumnReader[targetColumnNum + (shouldReadHiddenColumn ? 1 : 0)]; + readers = new ColumnReader[targetColumnNum + (needReadHiddenColumn ? 
1 : 0)]; for (int i = 0; i < resultColumns.length; i++) { int index = resultColumns[i]; readers[i] = ColumnReader.newColumnReader(columnSchemas.get(index), option); } - if (this.shouldReadHiddenColumn) + if (this.needReadHiddenColumn) { // create reader for the hidden timestamp column readers[readers.length - 1] = ColumnReader.newColumnReader(TypeDescription.HIDDEN_COLUMN_TYPE, option); @@ -349,7 +356,7 @@ private boolean prepareRead() throws IOException { for (int i = 0; i < RGLen; i++) { - if (shouldReadHiddenColumn) + if (filterByHiddenTimestamp) { // get the row group level hidden timestamp column statistics PixelsProto.RowGroupStatistic rowGroupStatistic = rowGroupStatistics.get(RGStart + i); @@ -418,7 +425,7 @@ else if (predicate.matchesNone()) for (int i = 0; i < RGLen; i++) { PixelsProto.RowGroupStatistic rowGroupStatistic = rowGroupStatistics.get(RGStart + i); - if (shouldReadHiddenColumn) + if (filterByHiddenTimestamp) { // get the row group level hidden timestamp column statistics PixelsProto.ColumnStatistic hiddenRowGroupStatistic = @@ -454,7 +461,7 @@ else if (predicate.matchesNone()) { for (int i = 0; i < RGLen; i++) { - if (shouldReadHiddenColumn) + if (filterByHiddenTimestamp) { PixelsProto.RowGroupStatistic rowGroupStatistic = rowGroupStatistics.get(RGStart + i); PixelsProto.ColumnStatistic hiddenRowGroupStatistic = rowGroupStatistic.getHiddenColumnChunkStats(); @@ -472,7 +479,7 @@ else if (predicate.matchesNone()) } } - if (!this.shouldReadHiddenColumn && includedColumnNum == 0) + if (!needReadHiddenColumn && includedColumnNum == 0) { /** * Issue #105: @@ -656,7 +663,7 @@ private boolean read() throws IOException * project nothing, must be count(*). 
* qualifiedRowNum and endOfFile have been set in prepareRead(); */ - if (!this.shouldReadHiddenColumn && includedColumnNum == 0) + if (!this.needReadHiddenColumn && includedColumnNum == 0) { if (!endOfFile) { @@ -677,10 +684,10 @@ private boolean read() throws IOException } // read chunk offset and length of each target column chunks - // include hidden column if shouldReadHiddenTimestamp is set - int includedColumnNum = includedColumns.length + (shouldReadHiddenColumn ? 1 : 0); - // include hidden column if shouldReadHiddenTimestamp is set - int targetColumnNum = targetColumns.length + (shouldReadHiddenColumn ? 1 : 0); + // include hidden column if filterByHiddenTimestamp or exposeHiddenColumn is true + int includedColumnNum = includedColumns.length + (needReadHiddenColumn ? 1 : 0); + // include hidden column if filterByHiddenTimestamp or exposeHiddenColumn is true + int targetColumnNum = targetColumns.length + (needReadHiddenColumn ? 1 : 0); this.chunkBuffers = new ByteBuffer[targetRGNum * includedColumnNum]; List diskChunks = new ArrayList<>(targetRGNum * targetColumnNum); // read cached data which are in need @@ -739,7 +746,7 @@ private boolean read() throws IOException } } } - if (shouldReadHiddenColumn) + if (needReadHiddenColumn) { // direct cache read is just for debug, so we just get this parameter here for simplicity. boolean direct = Boolean.parseBoolean(ConfigFactory.Instance().getProperty("cache.read.direct")); @@ -795,7 +802,7 @@ private boolean read() throws IOException PixelsProto.RowGroupIndex rowGroupIndex = rowGroupFooters[rgIdx].getRowGroupIndexEntry(); PixelsProto.ColumnChunkIndex chunkIndex = - (this.shouldReadHiddenColumn && colId == includedColumns.length) ? + (colId == includedColumns.length) ? 
rowGroupIndex.getHiddenColumnChunkIndexEntry() : rowGroupIndex.getColumnChunkIndexEntries(colId); ChunkId diskChunk = new ChunkId(rgIdx, colId, chunkIndex.getChunkOffset(), @@ -823,7 +830,7 @@ private boolean read() throws IOException chunkIndex.getChunkLength()); diskChunks.add(chunk); } - if (shouldReadHiddenColumn) + if (needReadHiddenColumn) { PixelsProto.ColumnChunkIndex chunkIndex = rowGroupIndex.getHiddenColumnChunkIndexEntry(); @@ -955,7 +962,7 @@ public int prepareBatch(int batchSize) throws IOException } } - if (!this.shouldReadHiddenColumn && includedColumnNum == 0) + if (!needReadHiddenColumn && includedColumnNum == 0) { /** * Issue #105: @@ -1069,7 +1076,7 @@ public VectorizedRowBatch readBatch(int batchSize, boolean reuse) } // project nothing, must be count(*) - if (!this.shouldReadHiddenColumn && includedColumnNum == 0) + if (!needReadHiddenColumn && includedColumnNum == 0) { /** * Issue #105: @@ -1092,14 +1099,26 @@ public VectorizedRowBatch readBatch(int batchSize, boolean reuse) { this.resultRowBatch = resultSchema.createRowBatch(batchSize, typeMode, resultColumnsEncoded); this.resultRowBatch.projectionSize = resultColumns.length; + if (option.isExposeHiddenColumn()) + { + this.resultRowBatch.hiddenColumnVector = new LongColumnVector(batchSize); + } } this.resultRowBatch.reset(); this.resultRowBatch.ensureSize(batchSize, false); + if (option.isExposeHiddenColumn() && this.resultRowBatch.hiddenColumnVector != null) + { + this.resultRowBatch.hiddenColumnVector.ensureSize(batchSize, false); + } resultRowBatch = this.resultRowBatch; } else { resultRowBatch = resultSchema.createRowBatch(batchSize, typeMode, resultColumnsEncoded); resultRowBatch.projectionSize = resultColumns.length; + if (option.isExposeHiddenColumn()) + { + resultRowBatch.hiddenColumnVector = new LongColumnVector(batchSize); + } } int rgRowCount = 0; @@ -1111,7 +1130,7 @@ public VectorizedRowBatch readBatch(int batchSize, boolean reuse) rgRowCount = 
footer.getRowGroupInfos(targetRGs[curRGIdx]).getNumberOfRows(); } - if (option.hasValidTransTimestamp()) + if (filterByHiddenTimestamp || rgVisibilityBitmaps != null) { while (resultRowBatch.size < batchSize && curRowInRG < rgRowCount) { @@ -1124,14 +1143,15 @@ public VectorizedRowBatch readBatch(int batchSize, boolean reuse) LongColumnVector hiddenTimestampVector = null; PixelsProto.RowGroupFooter rowGroupFooter = rowGroupFooters[curRGIdx]; - if (this.shouldReadHiddenColumn) + if (this.needReadHiddenColumn) { // read the hidden timestamp column hiddenTimestampVector = new LongColumnVector(curBatchSize); int hiddenTimestampColId = includedColumns.length; PixelsProto.ColumnEncoding hiddenTimestampEncoding = rowGroupFooter.getRowGroupEncoding() .getHiddenColumnChunkEncoding(); - int hiddenTimestampIndex = curRGIdx * (includedColumns.length + 1) + hiddenTimestampColId; + int hiddenTimestampStride = includedColumns.length + (needReadHiddenColumn ? 1 : 0); + int hiddenTimestampIndex = curRGIdx * hiddenTimestampStride + hiddenTimestampColId; PixelsProto.ColumnChunkIndex hiddenTimestampChunkIndex = rowGroupFooter.getRowGroupIndexEntry() .getHiddenColumnChunkIndexEntry(); readers[readers.length - 1].read(chunkBuffers[hiddenTimestampIndex], hiddenTimestampEncoding, @@ -1162,7 +1182,8 @@ public VectorizedRowBatch readBatch(int batchSize, boolean reuse) { PixelsProto.ColumnEncoding encoding = rowGroupFooter.getRowGroupEncoding() .getColumnChunkEncodings(resultColumns[i]); - int index = curRGIdx * (includedColumns.length + 1) + resultColumns[i]; + int stride = includedColumns.length + (needReadHiddenColumn ? 
1 : 0); + int index = curRGIdx * stride + resultColumns[i]; PixelsProto.ColumnChunkIndex chunkIndex = rowGroupFooter.getRowGroupIndexEntry() .getColumnChunkIndexEntries(resultColumns[i]); readers[i].readSelected(chunkBuffers[index], encoding, curRowInRG, curBatchSize, @@ -1170,6 +1191,22 @@ public VectorizedRowBatch readBatch(int batchSize, boolean reuse) } } + // populate hiddenColumnVector for selected rows when requested + if (option.isExposeHiddenColumn() && hiddenTimestampVector != null) + { + int batchOffset = resultRowBatch.size; + int selectedIdx = 0; + for (int i = 0; i < curBatchSize; i++) + { + if (selectedRows.get(i)) + { + resultRowBatch.hiddenColumnVector.vector[batchOffset + selectedIdx] = + hiddenTimestampVector.vector[i]; + selectedIdx++; + } + } + } + // update current row index in the row group curRowInRG += curBatchSize; rowIndex += curBatchSize; @@ -1245,7 +1282,8 @@ public VectorizedRowBatch readBatch(int batchSize, boolean reuse) PixelsProto.RowGroupFooter rowGroupFooter = rowGroupFooters[curRGIdx]; PixelsProto.ColumnEncoding encoding = rowGroupFooter.getRowGroupEncoding() .getColumnChunkEncodings(resultColumns[i]); - int index = curRGIdx * includedColumns.length + resultColumns[i]; + int stride = includedColumns.length + (needReadHiddenColumn ? 1 : 0); + int index = curRGIdx * stride + resultColumns[i]; PixelsProto.ColumnChunkIndex chunkIndex = rowGroupFooter.getRowGroupIndexEntry() .getColumnChunkIndexEntries(resultColumns[i]); readers[i].read(chunkBuffers[index], encoding, curRowInRG, curBatchSize, @@ -1253,6 +1291,22 @@ public VectorizedRowBatch readBatch(int batchSize, boolean reuse) } } + // read and expose hidden column when requested but no transTimestamp filtering + if (option.isExposeHiddenColumn()) + { + PixelsProto.RowGroupFooter rowGroupFooter = rowGroupFooters[curRGIdx]; + int hiddenColId = includedColumns.length; + int hiddenStride = includedColumns.length + (needReadHiddenColumn ? 
1 : 0); + int hiddenIdx = curRGIdx * hiddenStride + hiddenColId; + PixelsProto.ColumnEncoding hiddenEncoding = rowGroupFooter.getRowGroupEncoding() + .getHiddenColumnChunkEncoding(); + PixelsProto.ColumnChunkIndex hiddenChunkIndex = rowGroupFooter.getRowGroupIndexEntry() + .getHiddenColumnChunkIndexEntry(); + readers[readers.length - 1].read(chunkBuffers[hiddenIdx], hiddenEncoding, + curRowInRG, curBatchSize, postScript.getPixelStride(), resultRowBatch.size, + resultRowBatch.hiddenColumnVector, hiddenChunkIndex); + } + // update current row index in the row group curRowInRG += curBatchSize; //preRowInRG = curRowInRG; // keep in sync with curRowInRG. diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/vector/VectorizedRowBatch.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/vector/VectorizedRowBatch.java index 40b9ee098f..ebc6960281 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/vector/VectorizedRowBatch.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/vector/VectorizedRowBatch.java @@ -46,6 +46,12 @@ public class VectorizedRowBatch implements AutoCloseable public int projectionSize; public int maxSize; // capacity, i.e., the maximum number of rows can be stored in this row batch. + /** + * Optional hidden commit-timestamp column vector, populated only when the reader + * is configured with {@code exposeHiddenColumn(true)}. Null otherwise. + */ + public LongColumnVector hiddenColumnVector; + private long memoryUsage = 0L; // If this is true, then there is no data in the batch -- we have hit the end of input. 
@@ -82,6 +88,7 @@ public VectorizedRowBatch(int numCols, int size) this.size = 0; this.maxSize = size; this.cols = new ColumnVector[numCols]; + this.hiddenColumnVector = null; memoryUsage += (long) Integer.BYTES * (size + numCols) + Integer.BYTES * 6 + Long.BYTES + 2; @@ -216,6 +223,10 @@ public VectorizedRowBatch applyFilter(Bitmap filter) { columnVector.applyFilter(filter, this.size); } + if (hiddenColumnVector != null) + { + hiddenColumnVector.applyFilter(filter, this.size); + } this.size = cardinality; return this; @@ -268,6 +279,19 @@ public void reset() // vc.init(); } } + if (hiddenColumnVector != null) + { + hiddenColumnVector.reset(); + } + } + + /** + * Return the hidden commit-timestamp column vector, or {@code null} if the + * reader was not configured to include hidden columns. + */ + public LongColumnVector getHiddenColumnVector() + { + return hiddenColumnVector; } /** @@ -282,6 +306,10 @@ public void ensureSize(int rows, boolean preserveData) cols[i].ensureSize(rows, preserveData); } } + if (hiddenColumnVector != null) + { + hiddenColumnVector.ensureSize(rows, preserveData); + } if (!preserveData) { this.size = 0; @@ -327,6 +355,11 @@ public void close() } this.cols = null; } + if (hiddenColumnVector != null) + { + hiddenColumnVector.close(); + hiddenColumnVector = null; + } } /** diff --git a/pixels-core/src/test/java/io/pixelsdb/pixels/core/TestTypeDescriptionConvert.java b/pixels-core/src/test/java/io/pixelsdb/pixels/core/TestTypeDescriptionConvert.java new file mode 100644 index 0000000000..b08e62665f --- /dev/null +++ b/pixels-core/src/test/java/io/pixelsdb/pixels/core/TestTypeDescriptionConvert.java @@ -0,0 +1,133 @@ +/* + * Copyright 2026 PixelsDB. + * + * This file is part of Pixels. + * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Affero GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. If not, see + * . + */ +package io.pixelsdb.pixels.core; + +import io.pixelsdb.pixels.core.vector.BinaryColumnVector; +import io.pixelsdb.pixels.core.vector.DecimalColumnVector; +import io.pixelsdb.pixels.core.vector.DoubleColumnVector; +import io.pixelsdb.pixels.core.vector.FloatColumnVector; +import io.pixelsdb.pixels.core.vector.LongColumnVector; +import org.junit.Test; + +import java.nio.ByteBuffer; +import java.util.Arrays; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** + * Tests for {@link TypeDescription#convertColumnVectorToByte( + * io.pixelsdb.pixels.core.vector.ColumnVector, int)}. + * Migrated from TestStorageGarbageCollector where they were misplaced. 
+ */ +public class TestTypeDescriptionConvert +{ + @Test + public void testConvertColumnVectorToByte_int() + { + LongColumnVector col = new LongColumnVector(2); + col.vector[0] = 42; + col.vector[1] = -1; + + byte[] bytes0 = TypeDescription.createInt().convertColumnVectorToByte(col, 0); + assertEquals(Integer.BYTES, bytes0.length); + assertEquals(42, ByteBuffer.wrap(bytes0).getInt()); + + byte[] bytes1 = TypeDescription.createInt().convertColumnVectorToByte(col, 1); + assertEquals(-1, ByteBuffer.wrap(bytes1).getInt()); + } + + @Test + public void testConvertColumnVectorToByte_long() + { + LongColumnVector col = new LongColumnVector(1); + col.vector[0] = Long.MAX_VALUE; + + byte[] bytes = TypeDescription.createLong().convertColumnVectorToByte(col, 0); + assertEquals(Long.BYTES, bytes.length); + assertEquals(Long.MAX_VALUE, ByteBuffer.wrap(bytes).getLong()); + } + + @Test + public void testConvertColumnVectorToByte_float() + { + FloatColumnVector col = new FloatColumnVector(1); + col.vector[0] = Float.floatToIntBits(3.14f); + + byte[] bytes = TypeDescription.createFloat().convertColumnVectorToByte(col, 0); + assertEquals(Integer.BYTES, bytes.length); + assertEquals(Float.floatToIntBits(3.14f), ByteBuffer.wrap(bytes).getInt()); + } + + @Test + public void testConvertColumnVectorToByte_double() + { + DoubleColumnVector col = new DoubleColumnVector(1); + col.vector[0] = Double.doubleToLongBits(2.718); + + byte[] bytes = TypeDescription.createDouble().convertColumnVectorToByte(col, 0); + assertEquals(Long.BYTES, bytes.length); + assertEquals(Double.doubleToLongBits(2.718), ByteBuffer.wrap(bytes).getLong()); + } + + @Test + public void testConvertColumnVectorToByte_string() + { + BinaryColumnVector col = new BinaryColumnVector(2); + byte[] hello = "hello".getBytes(); + byte[] world = "world!".getBytes(); + col.setVal(0, hello); + col.setVal(1, world); + + byte[] bytes0 = TypeDescription.createVarchar(255).convertColumnVectorToByte(col, 0); + assertTrue("string bytes must 
match", Arrays.equals(hello, bytes0)); + + byte[] bytes1 = TypeDescription.createString().convertColumnVectorToByte(col, 1); + assertTrue("string bytes must match", Arrays.equals(world, bytes1)); + } + + @Test + public void testConvertColumnVectorToByte_shortDecimal() + { + DecimalColumnVector col = new DecimalColumnVector(10, 2); + col.vector[0] = 12345L; + + TypeDescription decType = TypeDescription.createDecimal(10, 2); + byte[] bytes = decType.convertColumnVectorToByte(col, 0); + assertEquals(Long.BYTES, bytes.length); + assertEquals(12345L, ByteBuffer.wrap(bytes).getLong()); + } + + @Test + public void testConvertColumnVectorToByte_boolean() + { + LongColumnVector col = new LongColumnVector(2); + col.vector[0] = 1; + col.vector[1] = 0; + + byte[] bytes0 = TypeDescription.createBoolean().convertColumnVectorToByte(col, 0); + assertEquals(1, bytes0.length); + assertEquals(1, bytes0[0]); + + byte[] bytes1 = TypeDescription.createBoolean().convertColumnVectorToByte(col, 1); + assertEquals(1, bytes1.length); + assertEquals(0, bytes1[0]); + } +} diff --git a/pixels-core/src/test/java/io/pixelsdb/pixels/core/reader/TestVisibilityCheckpointCache.java b/pixels-core/src/test/java/io/pixelsdb/pixels/core/reader/TestVisibilityCheckpointCache.java index ed1bcbe973..e4ca0e3040 100644 --- a/pixels-core/src/test/java/io/pixelsdb/pixels/core/reader/TestVisibilityCheckpointCache.java +++ b/pixels-core/src/test/java/io/pixelsdb/pixels/core/reader/TestVisibilityCheckpointCache.java @@ -46,7 +46,7 @@ public class TestVisibilityCheckpointCache @Before public void setUp() throws IOException { - testCheckpointDir = ConfigFactory.Instance().getProperty("pixels.retina.checkpoint.dir"); + testCheckpointDir = ConfigFactory.Instance().getProperty("retina.checkpoint.dir"); storage = StorageFactory.Instance().getStorage(testCheckpointDir); if (!storage.exists(testCheckpointDir)) diff --git a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/MetadataServiceImpl.java 
b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/MetadataServiceImpl.java index 64847ed4df..5b65dd637e 100644 --- a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/MetadataServiceImpl.java +++ b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/MetadataServiceImpl.java @@ -1491,6 +1491,52 @@ public void deleteFiles(MetadataProto.DeleteFilesRequest request, responseObserver.onCompleted(); } + @Override + public void getFileById(MetadataProto.GetFileByIdRequest request, + StreamObserver responseObserver) + { + MetadataProto.ResponseHeader.Builder headerBuilder = MetadataProto.ResponseHeader.newBuilder() + .setToken(request.getHeader().getToken()); + + MetadataProto.GetFileByIdResponse.Builder responseBuilder = MetadataProto.GetFileByIdResponse.newBuilder(); + MetadataProto.File file = this.fileDao.getById(request.getFileId()); + if (file != null) + { + headerBuilder.setErrorCode(SUCCESS).setErrorMsg(""); + responseBuilder.setFile(file).setHeader(headerBuilder); + } + else + { + headerBuilder.setErrorCode(METADATA_GET_FILE_BY_ID_FAILED).setErrorMsg("get file by id failed"); + responseBuilder.setHeader(headerBuilder); + } + + responseObserver.onNext(responseBuilder.build()); + responseObserver.onCompleted(); + } + + @Override + public void atomicSwapFiles(MetadataProto.AtomicSwapFilesRequest request, + StreamObserver responseObserver) + { + MetadataProto.ResponseHeader.Builder headerBuilder = MetadataProto.ResponseHeader.newBuilder() + .setToken(request.getHeader().getToken()); + + if (this.fileDao.atomicSwapFiles(request.getNewFileId(), request.getOldFileIdsList())) + { + headerBuilder.setErrorCode(SUCCESS).setErrorMsg(""); + } + else + { + headerBuilder.setErrorCode(METADATA_ATOMIC_SWAP_FILES_FAILED).setErrorMsg("atomic swap files failed"); + } + + MetadataProto.AtomicSwapFilesResponse response = MetadataProto.AtomicSwapFilesResponse.newBuilder() + .setHeader(headerBuilder).build(); + responseObserver.onNext(response); + 
responseObserver.onCompleted(); + } + @Override public void createPeerPath(MetadataProto.CreatePeerPathRequest request, StreamObserver responseObserver) diff --git a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/dao/FileDao.java b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/dao/FileDao.java index c3e9d2a697..73b921008b 100644 --- a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/dao/FileDao.java +++ b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/dao/FileDao.java @@ -73,4 +73,12 @@ public boolean save (MetadataProto.File file) abstract public boolean update (MetadataProto.File file); abstract public boolean deleteByIds (List ids); + + /** + * Atomically promote a TEMPORARY file to REGULAR and delete the old files in a single transaction. + * @param newFileId the id of the new TEMPORARY file to promote + * @param oldFileIds the ids of old files to delete + * @return true if the transaction committed successfully + */ + abstract public boolean atomicSwapFiles(long newFileId, List oldFileIds); } diff --git a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/dao/impl/RdbFileDao.java b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/dao/impl/RdbFileDao.java index dc6507ca5d..1af30d564b 100644 --- a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/dao/impl/RdbFileDao.java +++ b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/dao/impl/RdbFileDao.java @@ -28,6 +28,7 @@ import java.sql.*; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; /** * @author hank @@ -269,4 +270,52 @@ public boolean deleteByIds(List ids) return false; } + + @Override + public boolean atomicSwapFiles(long newFileId, List oldFileIds) + { + Connection conn = db.getConnection(); + try + { + conn.setAutoCommit(false); + try (PreparedStatement pst = conn.prepareStatement( + "UPDATE FILES SET FILE_TYPE=? 
WHERE FILE_ID=?")) + { + pst.setInt(1, MetadataProto.File.Type.REGULAR.getNumber()); + pst.setLong(2, newFileId); + pst.executeUpdate(); + } + String inClause = oldFileIds.stream().map(id -> "?").collect(Collectors.joining(",")); + try (PreparedStatement pst = conn.prepareStatement( + "DELETE FROM FILES WHERE FILE_ID IN (" + inClause + ")")) + { + for (int i = 0; i < oldFileIds.size(); i++) + { + pst.setLong(i + 1, oldFileIds.get(i)); + } + pst.executeUpdate(); + } + conn.commit(); + return true; + } catch (SQLException e) + { + try + { + conn.rollback(); + } catch (SQLException ignored) + { + } + log.error("atomicSwapFiles in RdbFileDao", e); + } finally + { + try + { + conn.setAutoCommit(true); + } catch (SQLException ignored) + { + } + } + + return false; + } } diff --git a/pixels-example/src/main/java/io/pixelsdb/pixels/example/core/TestPixelsReader.java b/pixels-example/src/main/java/io/pixelsdb/pixels/example/core/TestPixelsReader.java index a7ec284f64..0f268e06aa 100644 --- a/pixels-example/src/main/java/io/pixelsdb/pixels/example/core/TestPixelsReader.java +++ b/pixels-example/src/main/java/io/pixelsdb/pixels/example/core/TestPixelsReader.java @@ -21,74 +21,588 @@ import io.pixelsdb.pixels.common.physical.Storage; import io.pixelsdb.pixels.common.physical.StorageFactory; -import io.pixelsdb.pixels.core.PixelsFooterCache; -import io.pixelsdb.pixels.core.PixelsReader; -import io.pixelsdb.pixels.core.PixelsReaderImpl; -import io.pixelsdb.pixels.core.TypeDescription; +import io.pixelsdb.pixels.core.*; +import io.pixelsdb.pixels.core.encoding.EncodingLevel; import io.pixelsdb.pixels.core.reader.PixelsReaderOption; import io.pixelsdb.pixels.core.reader.PixelsRecordReader; +import io.pixelsdb.pixels.core.utils.Bitmap; +import io.pixelsdb.pixels.core.vector.LongColumnVector; import io.pixelsdb.pixels.core.vector.VectorizedRowBatch; import java.io.IOException; -import java.util.List; +import java.nio.file.Files; +import java.nio.file.Paths; /** - * @author hank + * 
Tests for hidden-timestamp-column expose/filter logic in PixelsRecordReaderImpl. + * + * Covers: + * T1 empty projection + expose only (no filter) + * T2 empty projection + filter + expose + * T3 with projection + filter + expose + * T4 with projection + expose only (no filter) + * T5 multi-RG stride correctness (small batch across RG boundaries) + * T6 no hidden column + valid transTimestamp (must not crash) + * T7 empty projection + filter + no expose (count(*) with timestamp filter) + * T8 expose + no-hidden-column file (must throw IOException) + * T9 filter + no expose (hiddenColumnVector must be null) + * T10 VectorizedRowBatch.applyFilter syncs hiddenColumnVector */ public class TestPixelsReader { - public static void main(String[] args) - { - // Note you may need to restart intellij to let it pick up the updated environment variable value - // example path: s3://bucket-name/test-file.pxl - String currentPath = System.getenv("PIXELS_S3_TEST_BUCKET_PATH") + "test.pxl"; - System.out.println(currentPath); - try { - Storage storage = StorageFactory.Instance().getStorage("s3"); - PixelsReader reader = PixelsReaderImpl.newBuilder() - .setStorage(storage) - .setPath(currentPath) - .setPixelsFooterCache(new PixelsFooterCache()) - .build(); - - TypeDescription schema = reader.getFileSchema(); - List fieldNames = schema.getFieldNames(); - System.out.println("fieldNames: " + fieldNames); - String[] cols = new String[fieldNames.size()]; - for (int i = 0; i < fieldNames.size(); i++) { - cols[i] = fieldNames.get(i); + private static final String TEST_DIR = "/tmp/pixels_test_reader/"; + private static final String FILE_WITH_HIDDEN = TEST_DIR + "with_hidden.pxl"; + private static final String FILE_NO_HIDDEN = TEST_DIR + "no_hidden.pxl"; + private static final String SCHEMA_STR = "struct"; + + private static final int ROWS_PER_BATCH = 10; + private static final int NUM_BATCHES = 3; + private static final int TOTAL_ROWS = ROWS_PER_BATCH * NUM_BATCHES; // 30 + + private static 
final long FILTER_TIMESTAMP = 150; + // timestamps are 10,20,...,300 → rows 0-14 have ts <= 150 → 15 rows pass + private static final int EXPECTED_FILTERED_ROWS = 15; + + private static Storage storage; + + public static void main(String[] args) throws Exception + { + storage = StorageFactory.Instance().getStorage("file"); + setup(); + try + { + writeFileWithHiddenColumn(); + writeFileWithoutHiddenColumn(); + + testT1_EmptyProjection_ExposeOnly(); + testT2_EmptyProjection_FilterAndExpose(); + testT3_WithProjection_FilterAndExpose(); + testT4_WithProjection_ExposeOnly(); + testT5_MultiRG_StrideCorrectness(); + testT6_NoHiddenColumn_ValidTimestamp(); + testT7_EmptyProjection_FilterNoExpose(); + testT8_ExposeNoHiddenColumn_ThrowsIOException(); + testT9_FilterNoExpose_HiddenVectorNull(); + testT10_ApplyFilter_SyncsHiddenColumn(); + + System.out.println("\n=== All 10 tests passed! ==="); + } finally + { + cleanup(); + } + } + + // ======================= helpers ======================= + + private static void setup() throws IOException + { + Files.createDirectories(Paths.get(TEST_DIR)); + Files.deleteIfExists(Paths.get(FILE_WITH_HIDDEN)); + Files.deleteIfExists(Paths.get(FILE_NO_HIDDEN)); + } + + private static void cleanup() throws IOException + { + Files.deleteIfExists(Paths.get(FILE_WITH_HIDDEN)); + Files.deleteIfExists(Paths.get(FILE_NO_HIDDEN)); + } + + private static void check(boolean condition, String message) + { + if (!condition) + { + throw new AssertionError(message); + } + } + + private static long expectedTimestamp(int globalRowIdx) + { + return (globalRowIdx + 1) * 10L; + } + + // ======================= write test files ======================= + + private static void writeFileWithHiddenColumn() throws Exception + { + TypeDescription schema = TypeDescription.fromString(SCHEMA_STR); + PixelsWriter writer = PixelsWriterImpl.newBuilder() + .setSchema(schema) + .setHasHiddenColumn(true) + .setPixelStride(10) + .setRowGroupSize(1) // minimal → forces every 
batch into its own row group + .setStorage(storage) + .setPath(FILE_WITH_HIDDEN) + .setBlockSize(256 * 1024) + .setReplication((short) 1) + .setBlockPadding(false) + .setEncodingLevel(EncodingLevel.EL2) + .setCompressionBlockSize(1) + .setNullsPadding(false) + .build(); + + for (int batch = 0; batch < NUM_BATCHES; batch++) + { + VectorizedRowBatch rowBatch = schema.createRowBatchWithHiddenColumn(); + LongColumnVector x = (LongColumnVector) rowBatch.cols[0]; + LongColumnVector y = (LongColumnVector) rowBatch.cols[1]; + LongColumnVector hidden = (LongColumnVector) rowBatch.cols[2]; + + for (int i = 0; i < ROWS_PER_BATCH; i++) + { + int g = batch * ROWS_PER_BATCH + i; + int row = rowBatch.size++; + x.vector[row] = g * 100L; + x.isNull[row] = false; + y.vector[row] = g * 200L; + y.isNull[row] = false; + hidden.vector[row] = expectedTimestamp(g); + hidden.isNull[row] = false; + } + writer.addRowBatch(rowBatch); + } + writer.close(); + System.out.println("Written file WITH hidden column: " + FILE_WITH_HIDDEN); + } + + private static void writeFileWithoutHiddenColumn() throws Exception + { + TypeDescription schema = TypeDescription.fromString(SCHEMA_STR); + PixelsWriter writer = PixelsWriterImpl.newBuilder() + .setSchema(schema) + .setHasHiddenColumn(false) + .setPixelStride(10000) + .setRowGroupSize(64 * 1024 * 1024) + .setStorage(storage) + .setPath(FILE_NO_HIDDEN) + .setBlockSize(256 * 1024) + .setReplication((short) 1) + .setBlockPadding(false) + .setEncodingLevel(EncodingLevel.EL2) + .setCompressionBlockSize(1) + .setNullsPadding(false) + .build(); + + VectorizedRowBatch rowBatch = schema.createRowBatch(ROWS_PER_BATCH); + LongColumnVector x = (LongColumnVector) rowBatch.cols[0]; + LongColumnVector y = (LongColumnVector) rowBatch.cols[1]; + for (int i = 0; i < ROWS_PER_BATCH; i++) + { + int row = rowBatch.size++; + x.vector[row] = i * 100L; + x.isNull[row] = false; + y.vector[row] = i * 200L; + y.isNull[row] = false; + } + writer.addRowBatch(rowBatch); + 
writer.close(); + System.out.println("Written file WITHOUT hidden column: " + FILE_NO_HIDDEN); + } + + // ======================= tests ======================= + + /** + * T1: empty projection + expose only (no filter). + * Enters else-branch, reads hidden column directly into hiddenColumnVector. + * All 30 rows returned with correct timestamps. + */ + private static void testT1_EmptyProjection_ExposeOnly() throws Exception + { + System.out.println("\n--- T1: Empty projection + expose only (no filter) ---"); + PixelsReader reader = PixelsReaderImpl.newBuilder() + .setStorage(storage).setPath(FILE_WITH_HIDDEN) + .setPixelsFooterCache(new PixelsFooterCache()).build(); + + PixelsReaderOption option = new PixelsReaderOption(); + option.includeCols(new String[0]); + option.exposeHiddenColumn(true); + option.skipCorruptRecords(true); + option.tolerantSchemaEvolution(true); + + PixelsRecordReader rr = reader.read(option); + VectorizedRowBatch batch = rr.readBatch(TOTAL_ROWS + 100); + + check(batch.cols.length == 0, "T1: expected 0 user columns, got " + batch.cols.length); + check(batch.size == TOTAL_ROWS, "T1: expected " + TOTAL_ROWS + " rows, got " + batch.size); + LongColumnVector hv = batch.getHiddenColumnVector(); + check(hv != null, "T1: hiddenColumnVector should not be null"); + for (int i = 0; i < TOTAL_ROWS; i++) + { + check(hv.vector[i] == expectedTimestamp(i), + "T1: row " + i + " ts expected " + expectedTimestamp(i) + ", got " + hv.vector[i]); + } + rr.close(); + reader.close(); + System.out.println("T1 passed!"); + } + + /** + * T2: empty projection + filter + expose. + * Enters if-branch, filters by timestamp, copies selected timestamps. 
+ */ + private static void testT2_EmptyProjection_FilterAndExpose() throws Exception + { + System.out.println("\n--- T2: Empty projection + filter + expose ---"); + PixelsReader reader = PixelsReaderImpl.newBuilder() + .setStorage(storage).setPath(FILE_WITH_HIDDEN) + .setPixelsFooterCache(new PixelsFooterCache()).build(); + + PixelsReaderOption option = new PixelsReaderOption(); + option.includeCols(new String[0]); + option.exposeHiddenColumn(true); + option.transId(0); + option.transTimestamp(FILTER_TIMESTAMP); + option.skipCorruptRecords(true); + option.tolerantSchemaEvolution(true); + + PixelsRecordReader rr = reader.read(option); + VectorizedRowBatch batch = rr.readBatch(TOTAL_ROWS + 100); + + check(batch.size == EXPECTED_FILTERED_ROWS, + "T2: expected " + EXPECTED_FILTERED_ROWS + " rows, got " + batch.size); + check(batch.cols.length == 0, "T2: expected 0 user columns, got " + batch.cols.length); + LongColumnVector hv = batch.getHiddenColumnVector(); + check(hv != null, "T2: hiddenColumnVector should not be null"); + for (int i = 0; i < batch.size; i++) + { + check(hv.vector[i] == expectedTimestamp(i), + "T2: row " + i + " ts expected " + expectedTimestamp(i) + ", got " + hv.vector[i]); + check(hv.vector[i] <= FILTER_TIMESTAMP, + "T2: row " + i + " ts " + hv.vector[i] + " should be <= " + FILTER_TIMESTAMP); + } + rr.close(); + reader.close(); + System.out.println("T2 passed!"); + } + + /** + * T3: with projection + filter + expose. + * Verifies user columns and hidden column are row-aligned after filtering. 
+ */ + private static void testT3_WithProjection_FilterAndExpose() throws Exception + { + System.out.println("\n--- T3: With projection + filter + expose ---"); + PixelsReader reader = PixelsReaderImpl.newBuilder() + .setStorage(storage).setPath(FILE_WITH_HIDDEN) + .setPixelsFooterCache(new PixelsFooterCache()).build(); + + PixelsReaderOption option = new PixelsReaderOption(); + option.includeCols(new String[]{"x", "y"}); + option.exposeHiddenColumn(true); + option.transId(0); + option.transTimestamp(FILTER_TIMESTAMP); + option.skipCorruptRecords(true); + option.tolerantSchemaEvolution(true); + + PixelsRecordReader rr = reader.read(option); + VectorizedRowBatch batch = rr.readBatch(TOTAL_ROWS + 100); + + check(batch.size == EXPECTED_FILTERED_ROWS, + "T3: expected " + EXPECTED_FILTERED_ROWS + " rows, got " + batch.size); + check(batch.cols.length == 2, "T3: expected 2 user columns, got " + batch.cols.length); + + LongColumnVector xCol = (LongColumnVector) batch.cols[0]; + LongColumnVector yCol = (LongColumnVector) batch.cols[1]; + LongColumnVector hv = batch.getHiddenColumnVector(); + check(hv != null, "T3: hiddenColumnVector should not be null"); + + for (int i = 0; i < batch.size; i++) + { + check(xCol.vector[i] == i * 100L, + "T3: row " + i + " x expected " + (i * 100L) + ", got " + xCol.vector[i]); + check(yCol.vector[i] == i * 200L, + "T3: row " + i + " y expected " + (i * 200L) + ", got " + yCol.vector[i]); + check(hv.vector[i] == expectedTimestamp(i), + "T3: row " + i + " ts expected " + expectedTimestamp(i) + ", got " + hv.vector[i]); + } + rr.close(); + reader.close(); + System.out.println("T3 passed!"); + } + + /** + * T4: with projection + expose only (no filter). + * Enters else-branch, reads all rows with user columns and hidden column. 
+ */ + private static void testT4_WithProjection_ExposeOnly() throws Exception + { + System.out.println("\n--- T4: With projection + expose only (no filter) ---"); + PixelsReader reader = PixelsReaderImpl.newBuilder() + .setStorage(storage).setPath(FILE_WITH_HIDDEN) + .setPixelsFooterCache(new PixelsFooterCache()).build(); + + PixelsReaderOption option = new PixelsReaderOption(); + option.includeCols(new String[]{"x"}); + option.exposeHiddenColumn(true); + option.skipCorruptRecords(true); + option.tolerantSchemaEvolution(true); + + PixelsRecordReader rr = reader.read(option); + VectorizedRowBatch batch = rr.readBatch(TOTAL_ROWS + 100); + + check(batch.size == TOTAL_ROWS, "T4: expected " + TOTAL_ROWS + " rows, got " + batch.size); + check(batch.cols.length == 1, "T4: expected 1 user column, got " + batch.cols.length); + + LongColumnVector xCol = (LongColumnVector) batch.cols[0]; + LongColumnVector hv = batch.getHiddenColumnVector(); + check(hv != null, "T4: hiddenColumnVector should not be null"); + + for (int i = 0; i < batch.size; i++) + { + check(xCol.vector[i] == i * 100L, + "T4: row " + i + " x expected " + (i * 100L) + ", got " + xCol.vector[i]); + check(hv.vector[i] == expectedTimestamp(i), + "T4: row " + i + " ts expected " + expectedTimestamp(i) + ", got " + hv.vector[i]); + } + rr.close(); + reader.close(); + System.out.println("T4 passed!"); + } + + /** + * T5: multi-RG stride correctness. + * Uses a small batch size (7) so that readBatch must cross RG boundaries. + * Reads with filter (if-branch) to exercise the fixed stride computation. + * Verifies every row's data is correct across all RGs. 
+ */ + private static void testT5_MultiRG_StrideCorrectness() throws Exception + { + System.out.println("\n--- T5: Multi-RG stride correctness ---"); + PixelsReader reader = PixelsReaderImpl.newBuilder() + .setStorage(storage).setPath(FILE_WITH_HIDDEN) + .setPixelsFooterCache(new PixelsFooterCache()).build(); + + int rgNum = reader.getRowGroupNum(); + System.out.println(" Row groups in file: " + rgNum); + check(rgNum >= 2, "T5: expected >= 2 row groups, got " + rgNum); + + PixelsReaderOption option = new PixelsReaderOption(); + option.includeCols(new String[]{"x", "y"}); + option.exposeHiddenColumn(true); + option.transId(0); + option.transTimestamp(Long.MAX_VALUE); // accept all rows, but still enters if-branch + option.skipCorruptRecords(true); + option.tolerantSchemaEvolution(true); + + PixelsRecordReader rr = reader.read(option); + int totalRead = 0; + while (true) + { + VectorizedRowBatch batch = rr.readBatch(7); // odd size to cross RG boundaries + if (batch.size == 0 && batch.endOfFile) + { + break; } - PixelsReaderOption option = new PixelsReaderOption(); - option.skipCorruptRecords(true); - option.tolerantSchemaEvolution(true); - option.includeCols(cols); - PixelsRecordReader recordReader = reader.read(option); - System.out.println("recordReader.getCompletedRows():" + recordReader.getCompletedRows()); - System.out.println("reader.getRowGroupInfo(0).getNumberOfRows():" + reader.getRowGroupInfo(0).getNumberOfRows()); - int batchSize = 10000; - VectorizedRowBatch rowBatch; - int len = 0; - int numRows = 0; - int numBatches = 0; - while (true) { - rowBatch = recordReader.readBatch(batchSize); - System.out.println("rowBatch: " + rowBatch); - numBatches++; - String result = rowBatch.toString(); - len += result.length(); - System.out.println("loop:" + numBatches + ", rowBatchSize:" + rowBatch.size); - if (rowBatch.endOfFile) { - numRows += rowBatch.size; - break; - } - numRows += rowBatch.size; + LongColumnVector xCol = (LongColumnVector) batch.cols[0]; + 
LongColumnVector hv = batch.getHiddenColumnVector(); + check(hv != null, "T5: hiddenColumnVector should not be null in batch starting at row " + totalRead); + + for (int i = 0; i < batch.size; i++) + { + int g = totalRead + i; + check(xCol.vector[i] == g * 100L, + "T5: globalRow " + g + " x expected " + (g * 100L) + ", got " + xCol.vector[i]); + check(hv.vector[i] == expectedTimestamp(g), + "T5: globalRow " + g + " ts expected " + expectedTimestamp(g) + ", got " + hv.vector[i]); + } + totalRead += batch.size; + if (batch.endOfFile) + { + break; } - System.out.println("numBatches:" + numBatches + ", numRows:" + numRows); - reader.close(); - } catch (IOException e) { - System.out.println("Err path: " + currentPath.toString()); - e.printStackTrace(); } + check(totalRead == TOTAL_ROWS, "T5: expected " + TOTAL_ROWS + " total rows, got " + totalRead); + rr.close(); + reader.close(); + System.out.println("T5 passed! (read " + totalRead + " rows across multiple RGs)"); + } + + /** + * T6: no hidden column + valid transTimestamp. + * After the fix, this enters the else-branch (no filtering needed). + * Must not crash, data must be correct. 
+ */ + private static void testT6_NoHiddenColumn_ValidTimestamp() throws Exception + { + System.out.println("\n--- T6: No hidden column + valid transTimestamp ---"); + PixelsReader reader = PixelsReaderImpl.newBuilder() + .setStorage(storage).setPath(FILE_NO_HIDDEN) + .setPixelsFooterCache(new PixelsFooterCache()).build(); + + PixelsReaderOption option = new PixelsReaderOption(); + option.includeCols(new String[]{"x", "y"}); + option.transId(0); + option.transTimestamp(100); + option.skipCorruptRecords(true); + option.tolerantSchemaEvolution(true); + + PixelsRecordReader rr = reader.read(option); + VectorizedRowBatch batch = rr.readBatch(100); + + check(batch.size == ROWS_PER_BATCH, + "T6: expected " + ROWS_PER_BATCH + " rows, got " + batch.size); + check(batch.getHiddenColumnVector() == null, + "T6: hiddenColumnVector should be null (exposeHiddenColumn not set)"); + + LongColumnVector xCol = (LongColumnVector) batch.cols[0]; + for (int i = 0; i < batch.size; i++) + { + check(xCol.vector[i] == i * 100L, + "T6: row " + i + " x expected " + (i * 100L) + ", got " + xCol.vector[i]); + } + rr.close(); + reader.close(); + System.out.println("T6 passed!"); + } + + /** + * T7: empty projection + filter + no expose. + * Equivalent to count(*) with timestamp filter. + * filterByHiddenTimestamp=true, needReadHiddenColumn=true, exposeHiddenColumn=false. + * Enters if-branch, filters rows, but hiddenColumnVector stays null. 
+ */ + private static void testT7_EmptyProjection_FilterNoExpose() throws Exception + { + System.out.println("\n--- T7: Empty projection + filter + no expose (filtered count(*)) ---"); + PixelsReader reader = PixelsReaderImpl.newBuilder() + .setStorage(storage).setPath(FILE_WITH_HIDDEN) + .setPixelsFooterCache(new PixelsFooterCache()).build(); + + PixelsReaderOption option = new PixelsReaderOption(); + option.includeCols(new String[0]); + option.transId(0); + option.transTimestamp(FILTER_TIMESTAMP); + option.skipCorruptRecords(true); + option.tolerantSchemaEvolution(true); + + PixelsRecordReader rr = reader.read(option); + VectorizedRowBatch batch = rr.readBatch(TOTAL_ROWS + 100); + + check(batch.cols.length == 0, "T7: expected 0 user columns, got " + batch.cols.length); + check(batch.size == EXPECTED_FILTERED_ROWS, + "T7: expected " + EXPECTED_FILTERED_ROWS + " rows, got " + batch.size); + check(batch.getHiddenColumnVector() == null, + "T7: hiddenColumnVector should be null when exposeHiddenColumn is false"); + rr.close(); + reader.close(); + System.out.println("T7 passed!"); + } + + /** + * T8: exposeHiddenColumn=true on a file without hidden column. + * Constructor must throw IOException. 
+ */ + private static void testT8_ExposeNoHiddenColumn_ThrowsIOException() throws Exception + { + System.out.println("\n--- T8: Expose + no hidden column file → IOException ---"); + PixelsReader reader = PixelsReaderImpl.newBuilder() + .setStorage(storage).setPath(FILE_NO_HIDDEN) + .setPixelsFooterCache(new PixelsFooterCache()).build(); + + PixelsReaderOption option = new PixelsReaderOption(); + option.includeCols(new String[]{"x"}); + option.exposeHiddenColumn(true); + option.skipCorruptRecords(true); + option.tolerantSchemaEvolution(true); + + try + { + reader.read(option); + check(false, "T8: should have thrown IOException"); + } catch (IOException e) + { + check(e.getMessage().contains("no hidden column"), + "T8: exception should mention 'no hidden column', got: " + e.getMessage()); + System.out.println(" Caught expected IOException: " + e.getMessage()); + } + reader.close(); + System.out.println("T8 passed!"); + } + + /** + * T9: filter + no expose (original behavior regression). + * hiddenColumnVector must be null in the result batch. 
+ */ + private static void testT9_FilterNoExpose_HiddenVectorNull() throws Exception + { + System.out.println("\n--- T9: Filter + no expose → hiddenColumnVector null ---"); + PixelsReader reader = PixelsReaderImpl.newBuilder() + .setStorage(storage).setPath(FILE_WITH_HIDDEN) + .setPixelsFooterCache(new PixelsFooterCache()).build(); + + PixelsReaderOption option = new PixelsReaderOption(); + option.includeCols(new String[]{"x", "y"}); + option.transId(0); + option.transTimestamp(FILTER_TIMESTAMP); + option.skipCorruptRecords(true); + option.tolerantSchemaEvolution(true); + // exposeHiddenColumn defaults to false + + PixelsRecordReader rr = reader.read(option); + VectorizedRowBatch batch = rr.readBatch(TOTAL_ROWS + 100); + + check(batch.size == EXPECTED_FILTERED_ROWS, + "T9: expected " + EXPECTED_FILTERED_ROWS + " rows, got " + batch.size); + check(batch.getHiddenColumnVector() == null, + "T9: hiddenColumnVector should be null when exposeHiddenColumn is false"); + + LongColumnVector xCol = (LongColumnVector) batch.cols[0]; + for (int i = 0; i < batch.size; i++) + { + check(xCol.vector[i] == i * 100L, + "T9: row " + i + " x expected " + (i * 100L) + ", got " + xCol.vector[i]); + } + rr.close(); + reader.close(); + System.out.println("T9 passed!"); + } + + /** + * T10: VectorizedRowBatch.applyFilter must sync hiddenColumnVector. + * Read all rows with expose, then apply an external filter keeping even-indexed rows. 
+ */ + private static void testT10_ApplyFilter_SyncsHiddenColumn() throws Exception + { + System.out.println("\n--- T10: applyFilter syncs hiddenColumnVector ---"); + PixelsReader reader = PixelsReaderImpl.newBuilder() + .setStorage(storage).setPath(FILE_WITH_HIDDEN) + .setPixelsFooterCache(new PixelsFooterCache()).build(); + + PixelsReaderOption option = new PixelsReaderOption(); + option.includeCols(new String[]{"x"}); + option.exposeHiddenColumn(true); + option.skipCorruptRecords(true); + option.tolerantSchemaEvolution(true); + + PixelsRecordReader rr = reader.read(option); + VectorizedRowBatch batch = rr.readBatch(TOTAL_ROWS + 100); + check(batch.size == TOTAL_ROWS, "T10: expected " + TOTAL_ROWS + " rows before filter"); + + Bitmap filter = new Bitmap(TOTAL_ROWS, false); + for (int i = 0; i < TOTAL_ROWS; i += 2) + { + filter.set(i); // keep even-indexed rows: 0, 2, 4, ... + } + batch.applyFilter(filter); + + int expectedAfterFilter = (TOTAL_ROWS + 1) / 2; // ceil(30/2) = 15 + check(batch.size == expectedAfterFilter, + "T10: expected " + expectedAfterFilter + " rows after filter, got " + batch.size); + + LongColumnVector xCol = (LongColumnVector) batch.cols[0]; + LongColumnVector hv = batch.getHiddenColumnVector(); + check(hv != null, "T10: hiddenColumnVector should survive applyFilter"); + + for (int i = 0; i < batch.size; i++) + { + int originalIdx = i * 2; + check(xCol.vector[i] == originalIdx * 100L, + "T10: filtered row " + i + " x expected " + (originalIdx * 100L) + ", got " + xCol.vector[i]); + check(hv.vector[i] == expectedTimestamp(originalIdx), + "T10: filtered row " + i + " ts expected " + expectedTimestamp(originalIdx) + ", got " + hv.vector[i]); + } + rr.close(); + reader.close(); + System.out.println("T10 passed!"); } } diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/FileWriterManager.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/FileWriterManager.java index 8a4960e1e7..f470cb728e 100644 --- 
a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/FileWriterManager.java +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/FileWriterManager.java @@ -76,7 +76,7 @@ public FileWriterManager(long tableId, TypeDescription schema, this.virtualNodeId = virtualNodeId; // Create pixels writer. - String targetFileName = PixelsFileNameUtils.buildRetinaFileName(hostName, virtualNodeId); + String targetFileName = PixelsFileNameUtils.buildOrderedFileName(hostName, virtualNodeId); String targetFilePath = targetOrderedDirPath.getUri() + "/" + targetFileName; try { @@ -97,7 +97,7 @@ public FileWriterManager(long tableId, TypeDescription schema, // Add the corresponding visibility for the file. RetinaResourceManager retinaResourceManager = RetinaResourceManager.Instance(); - retinaResourceManager.addVisibility(this.file.getId(), 0, recordNum); + retinaResourceManager.addVisibility(this.file.getId(), 0, recordNum, 0L, null, false); try { diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RGVisibility.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RGVisibility.java index a986717f93..1816f262d5 100644 --- a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RGVisibility.java +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RGVisibility.java @@ -65,22 +65,14 @@ public class RGVisibility implements AutoCloseable private final AtomicLong nativeHandle = new AtomicLong(); private final long recordNum; - public RGVisibility(long rgRecordNum) - { - this.recordNum = rgRecordNum; - this.nativeHandle.set(createNativeObject(rgRecordNum)); - } - public RGVisibility(long rgRecordNum, long timestamp, long[] initialBitmap) { - this.recordNum = rgRecordNum; - if (initialBitmap == null) - { - this.nativeHandle.set(createNativeObject(rgRecordNum)); - } else + if (timestamp < 0) { - this.nativeHandle.set(createNativeObjectInitialized(rgRecordNum, timestamp, initialBitmap)); + throw new IllegalArgumentException("timestamp must not be 
negative"); } + this.recordNum = rgRecordNum; + this.nativeHandle.set(createNativeObject(rgRecordNum, timestamp, initialBitmap)); } public long getRecordNum() @@ -99,12 +91,13 @@ public void close() } // native methods - private native long createNativeObject(long rgRecordNum); - private native long createNativeObjectInitialized(long rgRecordNum, long timestamp, long[] bitmap); + private native long createNativeObject(long rgRecordNum, long timestamp, long[] bitmap); private native void destroyNativeObject(long nativeHandle); private native void deleteRecord(int rgRowOffset, long timestamp, long nativeHandle); private native long[] getVisibilityBitmap(long timestamp, long nativeHandle); - private native void garbageCollect(long timestamp, long nativeHandle); + private native long[] garbageCollect(long timestamp, long nativeHandle); + private native long[] exportChainItemsAfter(long safeGcTs, long nativeHandle); + private native void importDeletionChain(long[] items, long nativeHandle); private static native long getNativeMemoryUsage(); private static native long getRetinaTrackedMemoryUsage(); private static native long getRetinaObjectCount(); @@ -132,7 +125,7 @@ public long[] getVisibilityBitmap(long timestamp) return bitmap; } - public void garbageCollect(long timestamp) + public long[] garbageCollect(long timestamp) { long handle = this.nativeHandle.get(); if (handle == 0) @@ -140,7 +133,27 @@ public void garbageCollect(long timestamp) throw new IllegalStateException("RGVisibility instance has been closed."); } - garbageCollect(timestamp, handle); + return garbageCollect(timestamp, handle); + } + + public long[] exportChainItemsAfter(long safeGcTs) + { + long handle = this.nativeHandle.get(); + if (handle == 0) + { + throw new IllegalStateException("RGVisibility instance has been closed."); + } + return exportChainItemsAfter(safeGcTs, handle); + } + + public void importDeletionChain(long[] items) + { + long handle = this.nativeHandle.get(); + if (handle == 0) + { 
+ throw new IllegalStateException("RGVisibility instance has been closed."); + } + importDeletionChain(items, handle); } /** diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java index 56b78883e8..6d4fb94bbf 100644 --- a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java @@ -36,6 +36,7 @@ import io.pixelsdb.pixels.common.utils.RetinaUtils; import io.pixelsdb.pixels.core.PixelsProto; import io.pixelsdb.pixels.core.TypeDescription; +import io.pixelsdb.pixels.core.encoding.EncodingLevel; import io.pixelsdb.pixels.index.IndexProto; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -48,6 +49,8 @@ import java.util.*; import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.locks.ReadWriteLock; +import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.stream.Collectors; /** @@ -63,17 +66,51 @@ public class RetinaResourceManager // GC related fields private final ScheduledExecutorService gcExecutor; + private final boolean storageGcEnabled; + private final StorageGarbageCollector storageGarbageCollector; // Checkpoint related fields private final ExecutorService checkpointExecutor; private final Map offloadedCheckpoints; private final Map> checkpointFutures; private final String checkpointDir; - private long latestGcTimestamp = -1; + private volatile long latestGcTimestamp = -1; private final int totalVirtualNodeNum; private final Map checkpointRefCounts; + // Dual-write: oldFileId → result AND newFileId → result in a single map. + // Direction is distinguished by checking fileId == result.newFileId. 
+ private final Map dualWriteLookup = new HashMap<>(); + private final ReadWriteLock redirectionLock = new ReentrantReadWriteLock(); + private volatile boolean isDualWriteActive = false; + + // Delayed cleanup queue for old files retired after atomic swap. + private final ConcurrentLinkedQueue retiredFiles = new ConcurrentLinkedQueue<>(); + + /** + * Metadata for an old file that has been atomically swapped out and awaits + * delayed physical deletion after a configurable wall-clock grace period. + */ + static final class RetiredFile + { + final long fileId; + final int rgCount; + final String filePath; + /** Epoch millis deadline; the file is eligible for deletion after this time. */ + final long retireTimestamp; + final List oldRowIds; + + RetiredFile(long fileId, int rgCount, String filePath, long retireTimestamp, List oldRowIds) + { + this.fileId = fileId; + this.rgCount = rgCount; + this.filePath = filePath; + this.retireTimestamp = retireTimestamp; + this.oldRowIds = oldRowIds; + } + } + private enum CheckpointType { GC, @@ -91,7 +128,7 @@ private RetinaResourceManager() ConfigFactory config = ConfigFactory.Instance(); this.checkpointRefCounts = new ConcurrentHashMap<>(); - this.checkpointDir = config.getProperty("pixels.retina.checkpoint.dir"); + this.checkpointDir = config.getProperty("retina.checkpoint.dir"); int cpThreads = Integer.parseInt(config.getProperty("retina.checkpoint.threads")); this.checkpointExecutor = Executors.newFixedThreadPool(cpThreads, r -> { @@ -124,6 +161,37 @@ private RetinaResourceManager() this.gcExecutor = executor; totalVirtualNodeNum = Integer.parseInt(ConfigFactory.Instance().getProperty("node.virtual.num")); this.retinaHostName = NetUtils.getLocalHostName(); + + boolean gcEnabled = false; + StorageGarbageCollector gc = null; + try + { + gcEnabled = Boolean.parseBoolean(config.getProperty("retina.storage.gc.enabled")); + if (gcEnabled) + { + double threshold = 
Double.parseDouble(config.getProperty("retina.storage.gc.threshold")); + long targetFileSize = Long.parseLong(config.getProperty("retina.storage.gc.target.file.size")); + int maxFilesPerGroup = Integer.parseInt(config.getProperty("retina.storage.gc.max.files.per.group")); + int maxGroups = Integer.parseInt(config.getProperty("retina.storage.gc.max.file.groups.per.run")); + int rowGroupSize = Integer.parseInt(config.getProperty("row.group.size")); + EncodingLevel encodingLevel = EncodingLevel.from( + Integer.parseInt(config.getProperty("retina.storage.gc.encoding.level"))); + long retireDelayMs = (long) (Double.parseDouble(config.getProperty("retina.storage.gc.file.retire.delay.hours")) * 3_600_000L); + gc = new StorageGarbageCollector(this, this.metadataService, + threshold, targetFileSize, maxFilesPerGroup, maxGroups, + rowGroupSize, encodingLevel, retireDelayMs); + logger.info("Storage GC enabled (threshold={}, targetFileSize={}, maxFilesPerGroup={}, maxGroups={})", + threshold, targetFileSize, maxFilesPerGroup, maxGroups); + } + } + catch (Exception e) + { + logger.error("Failed to initialise StorageGarbageCollector, Storage GC will be disabled", e); + gcEnabled = false; + gc = null; + } + this.storageGcEnabled = gcEnabled; + this.storageGarbageCollector = gc; } private static final class InstanceHolder @@ -136,15 +204,18 @@ public static RetinaResourceManager Instance() return InstanceHolder.instance; } - public void addVisibility(long fileId, int rgId, int recordNum) + public void addVisibility(long fileId, int rgId, int recordNum, long timestamp, + long[] bitmap, boolean overwrite) { - String rgKey = fileId + "_" + rgId; - if (rgVisibilityMap.containsKey(rgKey)) + String rgKey = RetinaUtils.buildRgKey(fileId, rgId); + if (overwrite) { - return; + rgVisibilityMap.put(rgKey, new RGVisibility(recordNum, timestamp, bitmap)); + } + else + { + rgVisibilityMap.computeIfAbsent(rgKey, k -> new RGVisibility(recordNum, timestamp, bitmap)); } - - 
rgVisibilityMap.put(rgKey, new RGVisibility(recordNum)); } public void addVisibility(String filePath) throws RetinaException @@ -167,7 +238,7 @@ public void addVisibility(String filePath) throws RetinaException for (int rgId = 0; rgId < footer.getRowGroupInfosCount(); rgId++) { int recordNum = footer.getRowGroupInfos(rgId).getNumberOfRows(); - addVisibility(fileId, rgId, recordNum); + addVisibility(fileId, rgId, recordNum, 0L, null, false); } } } catch (Exception e) @@ -286,10 +357,15 @@ public void unregisterOffload(long timestamp) } private CompletableFuture createCheckpoint(long timestamp, CheckpointType type) throws RetinaException + { + return createCheckpoint(timestamp, type, null); + } + + private CompletableFuture createCheckpoint( + long timestamp, CheckpointType type, Map precomputedBitmaps) throws RetinaException { String prefix = (type == CheckpointType.GC) ? RetinaUtils.CHECKPOINT_PREFIX_GC : RetinaUtils.CHECKPOINT_PREFIX_OFFLOAD; - String fileName = RetinaUtils.getCheckpointFileName(prefix, retinaHostName, timestamp); - String filePath = checkpointDir.endsWith("/") ? checkpointDir + fileName : checkpointDir + "/" + fileName; + String filePath = RetinaUtils.buildCheckpointPath(checkpointDir, prefix, retinaHostName, timestamp); // 1. Capture current entries to ensure we process a consistent set of RGs List> entries = new ArrayList<>(this.rgVisibilityMap.entrySet()); @@ -297,21 +373,26 @@ private CompletableFuture createCheckpoint(long timestamp, CheckpointType logger.info("Starting {} checkpoint for {} RGs at timestamp {}", type, totalRgs, timestamp); // 2. Use a BlockingQueue for producer-consumer pattern - // Limit capacity to avoid excessive memory usage if writing is slow BlockingQueue queue = new LinkedBlockingQueue<>(1024); - // 3. Start producer tasks to fetch bitmaps in parallel + // 3. 
Start producer tasks to fetch bitmaps for (Map.Entry entry : entries) { checkpointExecutor.submit(() -> { try { String key = entry.getKey(); - String[] parts = key.split("_"); - long fileId = Long.parseLong(parts[0]); - int rgId = Integer.parseInt(parts[1]); + long fileId = RetinaUtils.parseFileIdFromRgKey(key); + int rgId = RetinaUtils.parseRgIdFromRgKey(key); RGVisibility rgVisibility = entry.getValue(); - long[] bitmap = rgVisibility.getVisibilityBitmap(timestamp); + long[] bitmap; + if (precomputedBitmaps != null && precomputedBitmaps.containsKey(key)) + { + bitmap = precomputedBitmaps.get(key); + } else + { + bitmap = rgVisibility.getVisibilityBitmap(timestamp); + } queue.put(new CheckpointFileIO.CheckpointEntry(fileId, rgId, (int) rgVisibility.getRecordNum(), bitmap)); } catch (Exception e) { @@ -320,63 +401,105 @@ private CompletableFuture createCheckpoint(long timestamp, CheckpointType }); } - // 4. Async Write: perform IO in background thread (Consumer) - // Use commonPool to avoid deadlocks with checkpointExecutor + // 4. Async Write: perform IO in background thread (Consumer). + // Use commonPool to avoid deadlocks with checkpointExecutor. + // Concurrency safety: for OFFLOAD type, registerOffload() guarantees at most + // one future per timestamp via synchronized(refCount) + checkpointFutures.computeIfAbsent. + // For GC type, runGC() is single-threaded. No file-level locking is needed here. 
return CompletableFuture.runAsync(() -> { - // Lock on filePath string intern to ensure only one thread writes to the same file - synchronized (filePath.intern()) + long startWrite = System.currentTimeMillis(); + try { - if (type == CheckpointType.OFFLOAD && offloadedCheckpoints.containsKey(timestamp)) + CheckpointFileIO.writeCheckpoint(filePath, totalRgs, queue); + long endWrite = System.currentTimeMillis(); + logger.info("Writing {} checkpoint file to {} took {} ms", type, filePath, (endWrite - startWrite)); + + if (type == CheckpointType.OFFLOAD) { - return; + offloadedCheckpoints.put(timestamp, filePath); } - if (type == CheckpointType.GC && timestamp <= latestGcTimestamp) + } catch (Exception e) + { + logger.error("Failed to commit {} checkpoint file for timestamp: {}", type, timestamp, e); + try + { + StorageFactory.Instance().getStorage(filePath).delete(filePath, false); + } catch (IOException ignored) { - return; } + throw new CompletionException(e); + } + }); + } - long startWrite = System.currentTimeMillis(); - try + /** + * Writes a checkpoint from pre-built {@link CheckpointFileIO.CheckpointEntry} objects, + * bypassing the {@code rgVisibilityMap} traversal and per-entry thread-pool submission + * that the other {@code createCheckpoint} overload performs. + * + *

    This is used by {@link #runGC()} when the entries have already been constructed + * during the Memory GC single-pass, avoiding a redundant second traversal of + * {@code rgVisibilityMap}. + */ + private CompletableFuture createCheckpointDirect( + long timestamp, CheckpointType type, + List preBuiltEntries) throws RetinaException + { + String prefix = (type == CheckpointType.GC) ? RetinaUtils.CHECKPOINT_PREFIX_GC : RetinaUtils.CHECKPOINT_PREFIX_OFFLOAD; + String filePath = RetinaUtils.buildCheckpointPath(checkpointDir, prefix, retinaHostName, timestamp); + + int totalRgs = preBuiltEntries.size(); + logger.info("Starting {} checkpoint (direct) for {} RGs at timestamp {}", type, totalRgs, timestamp); + + BlockingQueue queue = new LinkedBlockingQueue<>(1024); + + // Feed pre-built entries into the queue via the checkpoint executor so that the + // producer-consumer pattern with the writer thread is preserved (the queue has a + // bounded capacity of 1024, so this may block and must not run on the caller thread). 
+ checkpointExecutor.submit(() -> { + try + { + for (CheckpointFileIO.CheckpointEntry entry : preBuiltEntries) { - // Use CheckpointFileIO for unified write logic - CheckpointFileIO.writeCheckpoint(filePath, totalRgs, queue); - long endWrite = System.currentTimeMillis(); - logger.info("Writing {} checkpoint file to {} took {} ms", type, filePath, (endWrite - startWrite)); + queue.put(entry); + } + } + catch (InterruptedException e) + { + Thread.currentThread().interrupt(); + logger.error("Interrupted while feeding pre-built checkpoint entries", e); + } + }); - if (type == CheckpointType.OFFLOAD) - { - offloadedCheckpoints.put(timestamp, filePath); - } else - { - long oldGcTs = this.latestGcTimestamp; - this.latestGcTimestamp = timestamp; - if (oldGcTs != -1 && oldGcTs != timestamp) - { - removeCheckpointFile(oldGcTs, CheckpointType.GC); - } - } - } catch (Exception e) + return CompletableFuture.runAsync(() -> { + try + { + CheckpointFileIO.writeCheckpoint(filePath, totalRgs, queue); + + if (type == CheckpointType.OFFLOAD) + { + offloadedCheckpoints.put(timestamp, filePath); + } + } + catch (Exception e) + { + logger.error("Failed to commit {} checkpoint file for timestamp: {}", type, timestamp, e); + try + { + StorageFactory.Instance().getStorage(filePath).delete(filePath, false); + } + catch (IOException ignored) { - logger.error("Failed to commit {} checkpoint file for timestamp: {}", type, timestamp, e); - // Try to cleanup the potentially corrupted or partial file - try - { - StorageFactory.Instance().getStorage(filePath).delete(filePath, false); - } catch (IOException ignored) - { - } - throw new CompletionException(e); } + throw new CompletionException(e); } }); } - private void removeCheckpointFile(long timestamp, CheckpointType type) { String prefix = (type == CheckpointType.GC) ? 
RetinaUtils.CHECKPOINT_PREFIX_GC : RetinaUtils.CHECKPOINT_PREFIX_OFFLOAD; - String fileName = RetinaUtils.getCheckpointFileName(prefix, retinaHostName, timestamp); - String path = checkpointDir.endsWith("/") ? checkpointDir + fileName : checkpointDir + "/" + fileName; + String path = RetinaUtils.buildCheckpointPath(checkpointDir, prefix, retinaHostName, timestamp); try { @@ -389,7 +512,7 @@ private void removeCheckpointFile(long timestamp, CheckpointType type) public void reclaimVisibility(long fileId, int rgId, long timestamp) throws RetinaException { - String retinaKey = fileId + "_" + rgId; + String retinaKey = RetinaUtils.buildRgKey(fileId, rgId); RGVisibility rgVisibility = this.rgVisibilityMap.remove(retinaKey); if (rgVisibility != null) { @@ -397,6 +520,62 @@ public void reclaimVisibility(long fileId, int rgId, long timestamp) throws Reti } } + /** + * Enqueues an old file for delayed cleanup after a configurable wall-clock + * grace period has elapsed. + */ + public void scheduleRetiredFile(RetiredFile retiredFile) + { + retiredFiles.add(retiredFile); + } + + /** + * Processes the retired files queue: for each file whose wall-clock + * {@code retireTimestamp} deadline has passed, removes its Visibility + * entries and deletes the physical file. + */ + public void processRetiredFiles() + { + long now = System.currentTimeMillis(); + retiredFiles.removeIf(rf -> + { + if (now <= rf.retireTimestamp) + { + return false; + } + for (int rgId = 0; rgId < rf.rgCount; rgId++) + { + try + { + reclaimVisibility(rf.fileId, rgId, 0); + } + catch (Exception e) + { + logger.warn("processRetiredFiles: failed to reclaim Visibility for fileId={}, rgId={}", + rf.fileId, rgId, e); + } + } + // Old MainIndex entries for retired files are purged lazily by the + // MainIndex implementation; no explicit cleanup is needed here. 
+ if (rf.filePath != null) + { + try + { + Storage storage = StorageFactory.Instance().getStorage(rf.filePath); + if (storage.exists(rf.filePath)) + { + storage.delete(rf.filePath, false); + } + } + catch (IOException e) + { + logger.warn("processRetiredFiles: failed to delete physical file {}", rf.filePath, e); + } + } + return true; + }); + } + public String getCheckpointPath(long timestamp) { return offloadedCheckpoints.get(timestamp); @@ -404,8 +583,55 @@ public String getCheckpointPath(long timestamp) public void deleteRecord(long fileId, int rgId, int rgRowOffset, long timestamp) throws RetinaException { - RGVisibility rgVisibility = checkRGVisibility(fileId, rgId); - rgVisibility.deleteRecord(rgRowOffset, timestamp); + checkRGVisibility(fileId, rgId).deleteRecord(rgRowOffset, timestamp); + + if (!isDualWriteActive) + { + return; + } + + redirectionLock.readLock().lock(); + try + { + StorageGarbageCollector.RewriteResult result = dualWriteLookup.get(fileId); + if (result == null) + { + return; + } + + if (fileId == result.newFileId) + { + // Backward: new file delete → sync to each old file + for (StorageGarbageCollector.BackwardInfo bwd : result.backwardInfos) + { + int[] bwdMapping = bwd.backwardRgMappings.get(rgId); + if (bwdMapping != null && rgRowOffset < bwdMapping.length && bwdMapping[rgRowOffset] >= 0) + { + int oldGlobal = bwdMapping[rgRowOffset]; + int oldRgId = rgIdForGlobalRowOffset(oldGlobal, bwd.oldFileRgRowStart); + int oldRgOff = oldGlobal - bwd.oldFileRgRowStart[oldRgId]; + checkRGVisibility(bwd.oldFileId, oldRgId).deleteRecord(oldRgOff, timestamp); + } + } + } + else + { + // Forward: old file delete → sync to new file + Map fileMapping = result.forwardRgMappings.get(fileId); + int[] fwdMapping = (fileMapping != null) ? 
fileMapping.get(rgId) : null; + if (fwdMapping != null && rgRowOffset < fwdMapping.length && fwdMapping[rgRowOffset] >= 0) + { + int newGlobal = fwdMapping[rgRowOffset]; + int newRgId = rgIdForGlobalRowOffset(newGlobal, result.newFileRgRowStart); + int newRgOff = newGlobal - result.newFileRgRowStart[newRgId]; + checkRGVisibility(result.newFileId, newRgId).deleteRecord(newRgOff, timestamp); + } + } + } + finally + { + redirectionLock.readLock().unlock(); + } } public void deleteRecord(IndexProto.RowLocation rowLocation, long timestamp) throws RetinaException @@ -413,6 +639,61 @@ public void deleteRecord(IndexProto.RowLocation rowLocation, long timestamp) thr deleteRecord(rowLocation.getFileId(), rowLocation.getRgId(), rowLocation.getRgRowOffset(), timestamp); } + /** + * Registers dual-write redirection so that {@link #deleteRecord} propagates + * deletes between old and new files. The write lock acts as a barrier: all + * prior deletes have completed before this returns, and all subsequent deletes + * will see the new mappings. + */ + void registerDualWrite(StorageGarbageCollector.RewriteResult result) + { + redirectionLock.writeLock().lock(); + try + { + for (Long oldFileId : result.forwardRgMappings.keySet()) + { + dualWriteLookup.put(oldFileId, result); + } + dualWriteLookup.put(result.newFileId, result); + isDualWriteActive = true; + } + finally + { + redirectionLock.writeLock().unlock(); + } + } + + /** + * Removes the dual-write redirection for the given rewrite result. 
+ */ + void unregisterDualWrite(StorageGarbageCollector.RewriteResult result) + { + redirectionLock.writeLock().lock(); + try + { + for (Long oldFileId : result.forwardRgMappings.keySet()) + { + dualWriteLookup.remove(oldFileId); + } + dualWriteLookup.remove(result.newFileId); + isDualWriteActive = !dualWriteLookup.isEmpty(); + } + finally + { + redirectionLock.writeLock().unlock(); + } + } + + long[] exportChainItemsAfter(long fileId, int rgId, long safeGcTs) throws RetinaException + { + return checkRGVisibility(fileId, rgId).exportChainItemsAfter(safeGcTs); + } + + void importDeletionChain(long fileId, int rgId, long[] items) throws RetinaException + { + checkRGVisibility(fileId, rgId).importDeletionChain(items); + } + public void addWriteBuffer(String schemaName, String tableName) throws RetinaException { try @@ -431,7 +712,7 @@ public void addWriteBuffer(String schemaName, String tableName) throws RetinaExc List columnTypes = columns.stream().map(Column::getType).collect(Collectors.toList()); TypeDescription schema = TypeDescription.createSchemaFromStrings(columnNames, columnTypes); - String writeBufferKey = schemaName + "_" + tableName; + String writeBufferKey = RetinaUtils.buildWriteBufferKey(schemaName, tableName); Map nodeBuffers = pixelsWriteBufferMap.computeIfAbsent( writeBufferKey, k -> new ConcurrentHashMap<>()); @@ -566,7 +847,7 @@ public RetinaProto.GetWriteBufferResponse.Builder getWriteBuffer(String schemaNa */ private RGVisibility checkRGVisibility(long fileId, int rgId) throws RetinaException { - String retinaKey = fileId + "_" + rgId; + String retinaKey = RetinaUtils.buildRgKey(fileId, rgId); RGVisibility rgVisibility = this.rgVisibilityMap.get(retinaKey); if (rgVisibility == null) { @@ -575,12 +856,34 @@ private RGVisibility checkRGVisibility(long fileId, int rgId) throws RetinaExcep return rgVisibility; } + /** + * Binary-searches {@code rgRowStart} (a sentinel-terminated cumulative array) to find + * the RG id that contains the given global 
row offset. + */ + static int rgIdForGlobalRowOffset(int globalOffset, int[] rgRowStart) + { + int lo = 0, hi = rgRowStart.length - 2; + while (lo < hi) + { + int mid = (lo + hi + 1) >>> 1; + if (rgRowStart[mid] <= globalOffset) + { + lo = mid; + } + else + { + hi = mid - 1; + } + } + return lo; + } + /** * Check if the writer buffer exists for the given schema and table. */ private PixelsWriteBuffer checkPixelsWriteBuffer(String schema, String table, int vNodeId) throws RetinaException { - String writeBufferKey = schema + "_" + table; + String writeBufferKey = RetinaUtils.buildWriteBufferKey(schema, table); Map nodeBuffers = this.pixelsWriteBufferMap.get(writeBufferKey); PixelsWriteBuffer writeBuffer = nodeBuffers.get(vNodeId); if (writeBuffer == null) @@ -592,10 +895,37 @@ private PixelsWriteBuffer checkPixelsWriteBuffer(String schema, String table, in } /** - * Run garbage collection on all registered RGVisibility. + * Run a full GC cycle: Memory GC → checkpoint → Storage GC. + * + *

+     * <p>Ordering rationale:</p>
+     * <ol>
+     * <li>Memory GC first: {@code collectTileGarbage} compacts Deletion Chain blocks
+     * whose last item ts ≤ lwm into {@code baseBitmap}. After compaction, the remaining
+     * chain starts at the first block that straddles the lwm boundary, so the subsequent
+     * {@code getVisibilityBitmap(lwm)} call traverses at most one partial block
+     * (≤ {@code BLOCK_CAPACITY} items) instead of the entire pre-GC chain. This makes
+     * checkpoint bitmap serialisation significantly cheaper.</li>
+     * <li>Checkpoint second, unconditional and blocking: written regardless of whether
+     * Storage GC finds any candidate files. The {@code .join()} ensures the checkpoint
+     * file is fully on disk before Storage GC begins rewriting any files, so crash
+     * recovery can always restore the post-Memory-GC visibility state independently of
+     * any in-progress Storage GC rewrite. {@code gcExecutor} is single-threaded, so the
+     * blocking join is also the simplest way to guarantee no two GC cycles overlap.</li>
+     * <li>Storage GC third: requires an up-to-date {@code baseBitmap} (hence after
+     * Memory GC) and its own WAL for crash recovery. Placing it after the checkpoint
+     * keeps the two recovery paths independent: on restart, the GC checkpoint restores
+     * the post-Memory-GC visibility state, and the GcWal resumes any in-progress Storage
+     * GC task separately. Once scan completes, bitmaps for non-candidate files are
+     * immediately released from memory (they are no longer needed by subsequent phases).</li>
+     * <li>Advance {@code latestGcTimestamp} last: updated only after the entire cycle
+     * succeeds (Memory GC + checkpoint + Storage GC). If any step throws, the timestamp
+     * is not advanced and the next scheduled invocation will retry the full cycle.</li>
+     * </ol>
    */ private void runGC() { + processRetiredFiles(); + long timestamp = 0; try { @@ -613,13 +943,70 @@ private void runGC() try { - // 1. Persist first - createCheckpoint(timestamp, CheckpointType.GC); - // 2. Then clean memory - for (Map.Entry entry: this.rgVisibilityMap.entrySet()) + // Step 1: Single pass over rgVisibilityMap — Memory GC + file-level stats + // aggregation + CheckpointEntry pre-building. Produces everything needed by + // checkpoint and Storage GC without any additional traversal. + Map gcSnapshotBitmaps = new HashMap<>(); + Map fileStats = new HashMap<>(); // fileId → {totalRows, totalInvalid} + List checkpointEntries = new ArrayList<>(); + + for (Map.Entry entry : this.rgVisibilityMap.entrySet()) + { + String rgKey = entry.getKey(); + long fileId = RetinaUtils.parseFileIdFromRgKey(rgKey); + int rgId = RetinaUtils.parseRgIdFromRgKey(rgKey); + + long[] bitmap = entry.getValue().garbageCollect(timestamp); + gcSnapshotBitmaps.put(rgKey, bitmap); + + long recordNum = entry.getValue().getRecordNum(); + long rgInvalidCount = 0; + for (long word : bitmap) + { + rgInvalidCount += Long.bitCount(word); + } + final long invalidCount = rgInvalidCount; + + fileStats.compute(fileId, (k, existing) -> { + if (existing == null) + { + return new long[]{recordNum, invalidCount}; + } + existing[0] += recordNum; + existing[1] += invalidCount; + return existing; + }); + + checkpointEntries.add( + new CheckpointFileIO.CheckpointEntry(fileId, rgId, (int) recordNum, bitmap)); + } + + // Step 2: Checkpoint — write pre-built entries directly to disk, skipping + // the second rgVisibilityMap traversal and per-entry thread-pool submission. + createCheckpointDirect(timestamp, CheckpointType.GC, checkpointEntries).join(); + + // Step 3: Storage GC — pass file-level stats so that candidate selection + // uses O(1) lookups instead of per-RG aggregation loops. 
+ if (storageGcEnabled && storageGarbageCollector != null) + { + try + { + storageGarbageCollector.runStorageGC(timestamp, fileStats, gcSnapshotBitmaps); + } + catch (Exception e) + { + logger.error("Storage GC failed", e); + } + } + + // Step 4: Advance the timestamp only after the full cycle succeeds. + // latestGcTimestamp is no longer updated inside createCheckpoint's async + // callback for GC type; this is the single authoritative update point. + long oldGcTs = this.latestGcTimestamp; + this.latestGcTimestamp = timestamp; + if (oldGcTs != -1 && oldGcTs != timestamp) { - RGVisibility rgVisibility = entry.getValue(); - rgVisibility.garbageCollect(timestamp); + removeCheckpointFile(oldGcTs, CheckpointType.GC); } } catch (Exception e) { @@ -683,26 +1070,24 @@ public void recoverCheckpoints() logger.info("Loading system state from GC checkpoint: {}", latestTs); // load to rgVisibilityMap - String fileName = RetinaUtils.getCheckpointFileName(RetinaUtils.CHECKPOINT_PREFIX_GC, retinaHostName, latestTs); - String latestPath = checkpointDir.endsWith("/") ? 
checkpointDir + fileName : checkpointDir + "/" + fileName; + String latestPath = RetinaUtils.buildCheckpointPath( + checkpointDir, RetinaUtils.CHECKPOINT_PREFIX_GC, retinaHostName, latestTs); try { Storage latestStorage = StorageFactory.Instance().getStorage(latestPath); if (latestStorage.exists(latestPath)) { - // Use CheckpointFileIO for unified read + parallel parsing logic final long ts = latestTs; int rgCount = CheckpointFileIO.readCheckpointParallel(latestPath, entry -> { - rgVisibilityMap.put(entry.fileId + "_" + entry.rgId, - new RGVisibility(entry.recordNum, ts, entry.bitmap)); + addVisibility(entry.fileId, entry.rgId, entry.recordNum, ts, entry.bitmap, true); }, checkpointExecutor); logger.info("Recovered {} RG entries from GC checkpoint", rgCount); } } catch (IOException e) { - logger.error("Failed to read checkpoint file: {}", e); + logger.error("Failed to read checkpoint file", e); } // delete old GC checkpoint files diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/StorageGarbageCollector.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/StorageGarbageCollector.java new file mode 100644 index 0000000000..3083eab906 --- /dev/null +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/StorageGarbageCollector.java @@ -0,0 +1,1369 @@ +/* + * Copyright 2026 PixelsDB. + * + * This file is part of Pixels. + * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Affero GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. 
If not, see + * . + */ +package io.pixelsdb.pixels.retina; + +import com.google.protobuf.ByteString; +import io.pixelsdb.pixels.common.exception.MetadataException; +import io.pixelsdb.pixels.common.exception.RetinaException; +import io.pixelsdb.pixels.common.index.IndexOption; +import io.pixelsdb.pixels.common.index.MainIndex; +import io.pixelsdb.pixels.common.index.MainIndexFactory; +import io.pixelsdb.pixels.common.index.RowIdRange; +import io.pixelsdb.pixels.common.index.SinglePointIndexFactory; +import io.pixelsdb.pixels.common.metadata.MetadataService; +import io.pixelsdb.pixels.common.metadata.domain.File; +import io.pixelsdb.pixels.common.metadata.domain.KeyColumns; +import io.pixelsdb.pixels.common.metadata.domain.Layout; +import io.pixelsdb.pixels.common.metadata.domain.Path; +import io.pixelsdb.pixels.common.metadata.domain.Schema; +import io.pixelsdb.pixels.common.metadata.domain.Table; +import io.pixelsdb.pixels.common.physical.Storage; +import io.pixelsdb.pixels.common.physical.StorageFactory; + +import io.pixelsdb.pixels.common.utils.NetUtils; +import io.pixelsdb.pixels.common.utils.PixelsFileNameUtils; +import io.pixelsdb.pixels.common.utils.RetinaUtils; +import io.pixelsdb.pixels.core.PixelsFooterCache; +import io.pixelsdb.pixels.core.PixelsReader; +import io.pixelsdb.pixels.core.PixelsReaderImpl; +import io.pixelsdb.pixels.core.PixelsWriter; +import io.pixelsdb.pixels.core.PixelsWriterImpl; +import io.pixelsdb.pixels.core.TypeDescription; +import io.pixelsdb.pixels.core.encoding.EncodingLevel; +import io.pixelsdb.pixels.core.reader.PixelsReaderOption; +import io.pixelsdb.pixels.core.reader.PixelsRecordReader; +import io.pixelsdb.pixels.core.vector.ColumnVector; +import io.pixelsdb.pixels.core.vector.LongColumnVector; +import io.pixelsdb.pixels.core.vector.VectorizedRowBatch; +import io.pixelsdb.pixels.index.IndexProto; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.io.IOException; +import 
java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Storage GC: identifies high-deletion-ratio files and rewrites them + * to reclaim physical storage while keeping active queries unaffected. + * + *

    Checkpoint ownership: the GC checkpoint is always written unconditionally + * by {@link RetinaResourceManager#runGC()} after Memory GC and before this + * class is invoked. {@code StorageGarbageCollector} never writes checkpoints. + */ +public class StorageGarbageCollector +{ + private static final Logger logger = LogManager.getLogger(StorageGarbageCollector.class); + + private final RetinaResourceManager resourceManager; + private final MetadataService metadataService; + private final double gcThreshold; + private final long targetFileSize; + private final int maxFilesPerGroup; + private final int maxFileGroupsPerRun; + private final int rowGroupSize; + private final EncodingLevel encodingLevel; + private final long retireDelayMs; + + // ------------------------------------------------------------------------- + // Value types + // ------------------------------------------------------------------------- + + /** + * Metadata about a single candidate file: its invalid-row ratio exceeds + * {@link #gcThreshold} and it is eligible for GC rewrite. + */ + static final class FileCandidate + { + final File file; + final String filePath; + final long fileId; + final int rgCount; + final long tableId; + final int virtualNodeId; + final double invalidRatio; + /** Physical file size in bytes, used for greedy group splitting. 0 if unknown. */ + final long fileSizeBytes; + + FileCandidate(File file, String filePath, long fileId, int rgCount, + long tableId, int virtualNodeId, double invalidRatio, + long fileSizeBytes) + { + this.file = file; + this.filePath = filePath; + this.fileId = fileId; + this.rgCount = rgCount; + this.tableId = tableId; + this.virtualNodeId = virtualNodeId; + this.invalidRatio = invalidRatio; + this.fileSizeBytes = fileSizeBytes; + } + } + + /** + * A group of candidate files sharing the same {@code (tableId, virtualNodeId)}. + * Files within the same group may be rewritten together while preserving row-ordering invariants. 
+ */ + static final class FileGroup + { + final long tableId; + final int virtualNodeId; + final List files; + + FileGroup(long tableId, int virtualNodeId, List files) + { + assert files.stream().allMatch(f -> f.virtualNodeId == virtualNodeId) + : "All files in a FileGroup must share the same virtualNodeId"; + this.tableId = tableId; + this.virtualNodeId = virtualNodeId; + this.files = files; + } + } + + /** + * Reverse mapping for one old file: maps new-file coordinates back to old-file + * global row offsets. One {@code BackwardInfo} per old file in the group. + */ + static final class BackwardInfo + { + final long oldFileId; + /** newRgId → bwdMapping[newRgRowOffset] = oldGlobalRowOffset, or -1 if no mapping */ + final Map backwardRgMappings; + /** oldFileRgRowStart[rgId] = global row offset of first row in that RG; length = rgCount + 1 */ + final int[] oldFileRgRowStart; + + BackwardInfo(long oldFileId, Map backwardRgMappings, int[] oldFileRgRowStart) + { + this.oldFileId = oldFileId; + this.backwardRgMappings = backwardRgMappings; + this.oldFileRgRowStart = oldFileRgRowStart; + } + } + + /** + * Captures a kept row's primary key bytes and create_ts during rewrite, + * for reconstructing the {@link IndexProto.IndexKey} during index synchronisation. + */ + static final class PendingIndexEntry + { + final int newGlobalRowOffset; + final ByteString pkBytes; + final long createTs; + + PendingIndexEntry(int newGlobalRowOffset, ByteString pkBytes, long createTs) + { + this.newGlobalRowOffset = newGlobalRowOffset; + this.pkBytes = pkBytes; + this.createTs = createTs; + } + } + + /** + * Carries everything produced by {@link #rewriteFileGroup}: file metadata, + * per-RG row counts, forward row mappings, backward row mappings, and + * pending index entries captured during rewrite. 
+ */ + static final class RewriteResult + { + final FileGroup group; + final String newFilePath; + final long newFileId; + final int newFileRgCount; + final int[] newFileRgActualRecordNums; + /** Sentinel array: newFileRgRowStart[i] = global row offset of first row in RG i. */ + final int[] newFileRgRowStart; + /** oldFileId → (oldRgId → fwdMapping[oldRgRowOffset] = newGlobalRowOffset, or -1 if deleted) */ + final Map> forwardRgMappings; + /** One {@link BackwardInfo} per old file; empty list when all rows were deleted. */ + final List backwardInfos; + /** PK + create_ts captured for each kept row; empty when primary index is absent. */ + final List pendingIndexEntries; + + /** Set by {@link #syncIndex} after allocating new rowIds. */ + long newRowIdStart = -1; + /** Set by {@link #syncIndex} after updating SinglePointIndex; old rowIds that were replaced. */ + List oldRowIds; + + RewriteResult(FileGroup group, String newFilePath, long newFileId, + int newFileRgCount, int[] newFileRgActualRecordNums, int[] newFileRgRowStart, + Map> forwardRgMappings, + List backwardInfos, + List pendingIndexEntries) + { + this.group = group; + this.newFilePath = newFilePath; + this.newFileId = newFileId; + this.newFileRgCount = newFileRgCount; + this.newFileRgActualRecordNums = newFileRgActualRecordNums; + this.newFileRgRowStart = newFileRgRowStart; + this.forwardRgMappings = forwardRgMappings; + this.backwardInfos = backwardInfos; + this.pendingIndexEntries = pendingIndexEntries; + } + } + + // ------------------------------------------------------------------------- + // Constructor + // ------------------------------------------------------------------------- + + StorageGarbageCollector(RetinaResourceManager resourceManager, + MetadataService metadataService, + double gcThreshold, + long targetFileSize, + int maxFilesPerGroup, + int maxFileGroupsPerRun, + int rowGroupSize, + EncodingLevel encodingLevel, + long retireDelayMs) + { + this.resourceManager = resourceManager; + 
this.metadataService = metadataService; + this.gcThreshold = gcThreshold; + this.targetFileSize = targetFileSize; + this.maxFilesPerGroup = maxFilesPerGroup; + this.maxFileGroupsPerRun = maxFileGroupsPerRun; + this.rowGroupSize = rowGroupSize; + this.encodingLevel = encodingLevel; + this.retireDelayMs = retireDelayMs; + } + + // ------------------------------------------------------------------------- + // Public entry point + // ------------------------------------------------------------------------- + + /** + * Runs one Storage GC cycle: identify candidates, trim non-candidate bitmaps, + * then scan metadata and process candidate file groups. + * + *

    The GC checkpoint has already been written unconditionally by + * {@link RetinaResourceManager#runGC()} before this method is called. + * + * @param safeGcTs safe GC timestamp produced by Memory GC + * @param fileStats file-level visibility statistics pre-computed during Memory GC; + * key = fileId, value = {@code long[]{totalRows, totalInvalidCount}}. + * Replaces the old per-RG {@code rgStats} map, eliminating the + * per-RG aggregation loop in candidate selection. + * @param gcSnapshotBitmaps per-RG snapshot bitmaps (mutated in-place: non-candidate + * entries removed to reduce memory pressure) + */ + void runStorageGC(long safeGcTs, Map fileStats, + Map gcSnapshotBitmaps) + { + // Pre-compute candidate file IDs from file-level stats (O(1) per file). + Set candidateFileIds = new HashSet<>(); + for (Map.Entry entry : fileStats.entrySet()) + { + long[] stats = entry.getValue(); + if (stats[0] > 0 && (double) stats[1] / stats[0] > gcThreshold) + { + candidateFileIds.add(entry.getKey()); + } + } + if (candidateFileIds.isEmpty()) + { + return; + } + + // Trim non-candidate bitmap entries immediately. The checkpoint has already been + // written with the full snapshot, so only candidate bitmaps are needed for rewriting. + gcSnapshotBitmaps.entrySet().removeIf(e -> + !candidateFileIds.contains(RetinaUtils.parseFileIdFromRgKey(e.getKey()))); + + List fileGroups = scanAndGroupFiles(candidateFileIds, fileStats); + if (!fileGroups.isEmpty()) + { + processFileGroups(fileGroups, safeGcTs, gcSnapshotBitmaps); + } + } + + /** + * Scans all schemas/tables and returns at most {@link #maxFileGroupsPerRun} groups of + * candidate files, sorted by average {@code invalidRatio} descending. + * + *

    Only files whose ID appears in {@code candidateFileIds} are considered; all others + * are skipped immediately. File-level stats (totalRows, invalidCount) are read from + * {@code fileStats} in O(1) — the old per-RG aggregation loop is eliminated. + * + * @param candidateFileIds file IDs that exceed the {@link #gcThreshold}, pre-computed + * in {@link #runStorageGC} + * @param fileStats file-level visibility statistics; key = fileId, + * value = {@code long[]{totalRows, totalInvalidCount}} + */ + List scanAndGroupFiles(Set candidateFileIds, + Map fileStats) + { + List candidates = new ArrayList<>(); + + List schemas; + try + { + schemas = metadataService.getSchemas(); + } + catch (MetadataException e) + { + logger.error("Storage GC: failed to retrieve schemas", e); + return Collections.emptyList(); + } + + for (Schema schema : schemas) + { + List

    TypeWriterGC eligible
    retina{@code FileWriterManager} (CDC real-time path)yes
    ordered{@code IndexedPixelsConsumer} (indexed batch load)yes
    ordered{@code FileWriterManager} (CDC real-time path) / + * {@code IndexedPixelsConsumer} (indexed batch load)yes
    compact{@code CompactExecutor}yes
    single{@code SimplePixelsConsumer} (non-indexed batch load)no
    copy{@code CopyExecutor} (test/benchmark data amplification)no
    tables; + try + { + tables = metadataService.getTables(schema.getName()); + } + catch (MetadataException e) + { + logger.warn("Storage GC: failed to get tables for schema '{}', skipping", + schema.getName(), e); + continue; + } + + for (Table table : tables) + { + Layout layout; + try + { + layout = metadataService.getLatestLayout(schema.getName(), table.getName()); + } + catch (MetadataException e) + { + logger.warn("Storage GC: failed to get layout for {}.{}, skipping", + schema.getName(), table.getName(), e); + continue; + } + if (layout == null) + { + continue; + } + + List paths = new ArrayList<>(); + paths.addAll(layout.getOrderedPaths()); + paths.addAll(layout.getCompactPaths()); + + for (Path path : paths) + { + List files; + try + { + files = metadataService.getFiles(path.getId()); + } + catch (MetadataException e) + { + logger.warn("Storage GC: failed to get files for pathId={}, skipping", + path.getId(), e); + continue; + } + + Storage pathStorage = null; + for (File file : files) + { + if (!candidateFileIds.contains(file.getId())) + { + continue; + } + + String filePath = File.getFilePath(path, file); + + if (!PixelsFileNameUtils.isGcEligible(filePath)) + { + continue; + } + + long[] stats = fileStats.get(file.getId()); + if (stats == null || stats[0] == 0) + { + continue; + } + double invalidRatio = (double) stats[1] / stats[0]; + + long sizeBytes; + try + { + if (pathStorage == null) + { + pathStorage = StorageFactory.Instance().getStorage(filePath); + } + sizeBytes = pathStorage.getStatus(filePath).getLength(); + } + catch (IOException ex) + { + logger.error("Storage GC: cannot stat file {}, skipping candidate", + filePath, ex); + continue; + } + + int vNodeId = PixelsFileNameUtils.extractVirtualNodeId(filePath); + candidates.add(new FileCandidate( + file, filePath, file.getId(), file.getNumRowGroup(), + table.getId(), vNodeId, invalidRatio, sizeBytes)); + } + } + } + } + + return groupAndMerge(candidates); + } + + /** + * Groups candidates by 
{@code (tableId, virtualNodeId)}, sorts each group by + * {@code invalidRatio} descending, then greedily splits each group into sub-groups + * whose estimated effective data size does not exceed {@link #targetFileSize}. + * + *

    Effective data size per file is estimated as + * {@code fileSizeBytes * (1 - invalidRatio)}. When {@code fileSizeBytes} is + * unknown (0), the file is treated as fitting within any remaining budget — + * i.e. splitting degrades to the old "all-in-one-group" behaviour. + * + *

    If a single file's effective data already exceeds {@code targetFileSize}, + * it forms its own {@link FileGroup}. + * + *

    The returned list is sorted by average {@code invalidRatio} descending and + * capped at {@link #maxFileGroupsPerRun}. + */ + List groupAndMerge(List candidates) + { + Map>> grouped = new LinkedHashMap<>(); + for (FileCandidate c : candidates) + { + grouped.computeIfAbsent(c.tableId, k -> new LinkedHashMap<>()) + .computeIfAbsent(c.virtualNodeId, k -> new ArrayList<>()) + .add(c); + } + + List groups = new ArrayList<>(); + for (Map.Entry>> tableEntry : grouped.entrySet()) + { + long tableId = tableEntry.getKey(); + for (Map.Entry> vnodeEntry : tableEntry.getValue().entrySet()) + { + int vNodeId = vnodeEntry.getKey(); + List files = vnodeEntry.getValue(); + files.sort(Comparator.comparingDouble((FileCandidate c) -> c.invalidRatio).reversed()); + splitIntoGroups(groups, tableId, vNodeId, files); + } + } + + groups.sort(Comparator.comparingDouble( + (FileGroup g) -> g.files.stream().mapToDouble(c -> c.invalidRatio).average().orElse(0.0)) + .reversed()); + + if (groups.size() > maxFileGroupsPerRun) + { + return groups.subList(0, maxFileGroupsPerRun); + } + return groups; + } + + /** + * Greedily packs {@code files} (already sorted by invalidRatio desc) into + * sub-groups bounded by both {@link #targetFileSize} (effective output bytes) + * and {@link #maxFilesPerGroup} (old file count). Whichever limit is reached + * first triggers a group flush. + */ + private void splitIntoGroups(List out, long tableId, int vNodeId, + List files) + { + if (targetFileSize <= 0 && maxFilesPerGroup <= 0) + { + out.add(new FileGroup(tableId, vNodeId, files)); + return; + } + + List current = new ArrayList<>(); + long currentEffectiveBytes = 0; + + for (FileCandidate fc : files) + { + long effectiveBytes = fc.fileSizeBytes > 0 + ? 
(long) (fc.fileSizeBytes * (1.0 - fc.invalidRatio)) : 0L; + + boolean singleFileOversized = targetFileSize > 0 && effectiveBytes > targetFileSize; + if (singleFileOversized) + { + if (!current.isEmpty()) + { + out.add(new FileGroup(tableId, vNodeId, current)); + current = new ArrayList<>(); + currentEffectiveBytes = 0; + } + out.add(new FileGroup(tableId, vNodeId, Collections.singletonList(fc))); + continue; + } + + boolean sizeWouldExceed = targetFileSize > 0 + && currentEffectiveBytes + effectiveBytes > targetFileSize; + boolean fileCountFull = maxFilesPerGroup > 0 + && current.size() >= maxFilesPerGroup; + + if ((sizeWouldExceed || fileCountFull) && !current.isEmpty()) + { + out.add(new FileGroup(tableId, vNodeId, current)); + current = new ArrayList<>(); + currentEffectiveBytes = 0; + } + current.add(fc); + currentEffectiveBytes += effectiveBytes; + } + if (!current.isEmpty()) + { + out.add(new FileGroup(tableId, vNodeId, current)); + } + } + + + + /** + * Computes the cumulative row-start offsets for an old file's RGs. + * {@code starts[rgId]} = global row offset of the first row in RG {@code rgId}; + * {@code starts[rgCount]} = total row count (sentinel). + */ + private static int[] computeOldFileRgRowStart(Map rgMappings, int rgCount) + { + int[] starts = new int[rgCount + 1]; + int accum = 0; + for (int rgId = 0; rgId < rgCount; rgId++) + { + starts[rgId] = accum; + int[] mapping = rgMappings.get(rgId); + accum += (mapping != null) ? mapping.length : 0; + } + starts[rgCount] = accum; + return starts; + } + + /** + * Registers dual-write for the given rewrite result so that subsequent + * {@link RetinaResourceManager#deleteRecord} calls propagate deletes + * between old and new files. + */ + void registerDualWrite(RewriteResult result) + { + resourceManager.registerDualWrite(result); + } + + /** + * Removes dual-write for the given rewrite result. 
+ */ + void unregisterDualWrite(RewriteResult result) + { + resourceManager.unregisterDualWrite(result); + } + + /** + * Processes the candidate file groups produced by {@link #scanAndGroupFiles}. + * Non-candidate bitmap entries have already been trimmed in {@link #runStorageGC}. + * + * @param fileGroups non-empty list of candidate groups + * @param safeGcTs safe GC timestamp produced by Memory GC + * @param gcSnapshotBitmaps per-RG snapshot bitmaps (already trimmed to candidates) + */ + void processFileGroups(List fileGroups, long safeGcTs, + Map gcSnapshotBitmaps) + { + for (FileGroup group : fileGroups) + { + processFileGroup(group, safeGcTs, gcSnapshotBitmaps); + } + } + + + /** + * Rewrites all files in one {@link FileGroup} into a single new file, filtering out + * rows marked as deleted in {@code gcSnapshotBitmaps}. + * + *

    The new file is registered as {@code TEMPORARY} in the catalog and its + * {@link RGVisibility} objects are initialised with {@code baseTimestamp = safeGcTs}. + * + *

    After rewriting completes the {@code gcSnapshotBitmaps} entries for this group + * are removed (they are no longer needed by subsequent steps). + * + * @param group candidate file group produced by {@link #scanAndGroupFiles} + * @param safeGcTs safe GC timestamp; used as the base timestamp for new-file Visibility + * @param gcSnapshotBitmaps per-RG deletion bitmaps; entries for this group are removed on exit + * @return rewrite result carrying file metadata and row mappings + */ + RewriteResult rewriteFileGroup(FileGroup group, long safeGcTs, + Map gcSnapshotBitmaps) throws Exception + { + String firstFilePath = group.files.get(0).filePath; + Storage storage = StorageFactory.Instance().getStorage(firstFilePath); + String dirUri = firstFilePath.substring(0, firstFilePath.lastIndexOf("/")); + String newFileName = PixelsFileNameUtils.buildOrderedFileName( + NetUtils.getLocalHostName(), group.virtualNodeId); + String newFilePath = dirUri + "/" + newFileName; + + // Open the first old file once to read schema + writer parameters. + // hasHiddenColumn is read here and propagated to the new-file writer so that + // the hidden create_ts column is preserved in the rewritten file. Without it, + // queries reading the new file would lose the ability to filter by create_ts, + // making rows with create_ts > safeGcTs incorrectly visible to snapshots between + // safeGcTs and their actual create_ts. + // One footer cache per rewrite call; shared across all readers for this file group. 
+ PixelsFooterCache footerCache = new PixelsFooterCache(); + + TypeDescription schema; + int pixelStride; + int compressionBlockSize; + boolean hasHiddenColumn; + try (PixelsReader firstReader = PixelsReaderImpl.newBuilder() + .setStorage(storage).setPath(firstFilePath) + .setPixelsFooterCache(footerCache).build()) + { + schema = firstReader.getFileSchema(); + pixelStride = (int) firstReader.getPixelStride(); + compressionBlockSize = (int) firstReader.getCompressionBlockSize(); + hasHiddenColumn = firstReader.getPostScript().getHasHiddenColumn(); + } + + int globalNewRowOffset = 0; + Map> forwardRgMappings = new HashMap<>(); + int nUserCols = schema.getChildren().size(); + String[] includeColNames = schema.getFieldNames().toArray(new String[0]); + List pendingIndexEntries = new ArrayList<>(); + + // Resolve PK columns for index key capture; null if no primary index exists. + int[] pkColIndices = null; + List pkColTypes = null; + try + { + io.pixelsdb.pixels.common.metadata.domain.SinglePointIndex primaryIndex = + metadataService.getPrimaryIndex(group.tableId); + if (primaryIndex != null) + { + KeyColumns keyColumns = primaryIndex.getKeyColumns(); + List colIds = keyColumns.getKeyColumnIds(); + pkColIndices = new int[colIds.size()]; + pkColTypes = new ArrayList<>(colIds.size()); + List children = schema.getChildren(); + for (int i = 0; i < colIds.size(); i++) + { + int colId = colIds.get(i); + pkColIndices[i] = colId; + pkColTypes.add(children.get(colId)); + } + } + } + catch (MetadataException e) + { + logger.warn("StorageGC: failed to resolve primary index for tableId={}, index sync will be skipped", + group.tableId, e); + } + + try (PixelsWriter writer = PixelsWriterImpl.newBuilder() + .setSchema(schema).setPixelStride(pixelStride) + .setRowGroupSize(rowGroupSize).setStorage(storage) + .setPath(newFilePath).setOverwrite(false) + .setEncodingLevel(encodingLevel) + .setCompressionBlockSize(compressionBlockSize) + .setHasHiddenColumn(hasHiddenColumn) + .build()) + { 
+ int batchCapacity = VectorizedRowBatch.DEFAULT_SIZE; + int[] selected = new int[batchCapacity]; + // filteredBatch extends cols[] with one extra LongColumnVector for create_ts + // when hasHiddenColumn=true. Per-column addSelected is used because the + // source batch's cols[] does not include the hidden column slot. + VectorizedRowBatch filteredBatch = schema.createRowBatch(batchCapacity); + if (hasHiddenColumn) + { + ColumnVector[] ext = Arrays.copyOf(filteredBatch.cols, nUserCols + 1); + ext[nUserCols] = new LongColumnVector(batchCapacity); + filteredBatch.cols = ext; + } + PixelsReaderOption opt = new PixelsReaderOption(); + opt.includeCols(includeColNames); + opt.exposeHiddenColumn(hasHiddenColumn); + + for (FileCandidate fc : group.files) + { + try (PixelsReader reader = PixelsReaderImpl.newBuilder() + .setStorage(storage).setPath(fc.filePath) + .setPixelsFooterCache(footerCache).build()) + { + for (int oldRgId = 0; oldRgId < reader.getRowGroupNum(); oldRgId++) + { + long[] gcBitmap = gcSnapshotBitmaps.get(RetinaUtils.buildRgKey(fc.fileId, oldRgId)); + int rgRecordNum = reader.getRowGroupInfo(oldRgId).getNumberOfRows(); + + // transTimestamp is not set: GC filtering uses gcSnapshotBitmap + // exclusively. Setting it would activate the hidden-timestamp + // filter and wrongly exclude alive rows with create_ts > safeGcTs. + opt.rgRange(oldRgId, 1); + + int oldRgRowOffset = 0; + int[] fwdMapping = new int[rgRecordNum]; + + try (PixelsRecordReader recordReader = reader.read(opt)) + { + VectorizedRowBatch batch; + while ((batch = recordReader.readBatch()) != null && batch.size > 0) + { + // GC row filter: a row is excluded iff its bit is set in + // gcSnapshotBitmap, meaning delete_ts <= safeGcTs. Rows with + // create_ts > safeGcTs are kept as long as their bit is 0 + // (not yet deleted or deleted after safeGcTs). 
+ int kept = 0; + for (int r = 0; r < batch.size; r++, oldRgRowOffset++) + { + // Check if the row's bit is set in the deletion bitmap (each long holds 64 bits) + if (gcBitmap != null && (gcBitmap[oldRgRowOffset >>> 6] & (1L << (oldRgRowOffset & 63))) != 0) + { + fwdMapping[oldRgRowOffset] = -1; + } + else + { + selected[kept++] = r; + fwdMapping[oldRgRowOffset] = globalNewRowOffset; + if (pkColIndices != null) + { + ByteString pkBytes = extractPkBytes(batch, r, pkColIndices, pkColTypes); + long createTs = hasHiddenColumn + ? ((LongColumnVector) batch.getHiddenColumnVector()).vector[r] + : 0L; + pendingIndexEntries.add( + new PendingIndexEntry(globalNewRowOffset, pkBytes, createTs)); + } + globalNewRowOffset++; + } + } + if (kept > 0) + { + for (int i = 0; i < nUserCols; i++) + { + filteredBatch.cols[i].addSelected(selected, 0, kept, batch.cols[i]); + } + if (hasHiddenColumn) + { + ((LongColumnVector) filteredBatch.cols[nUserCols]) + .addSelected(selected, 0, kept, batch.getHiddenColumnVector()); + } + filteredBatch.size = kept; + writer.addRowBatch(filteredBatch); + filteredBatch.reset(); + } + } + } + forwardRgMappings.computeIfAbsent(fc.fileId, k -> new HashMap<>()).put(oldRgId, fwdMapping); + } + } + } + } // writer.close() + + // Release the gcSnapshotBitmaps for this group; rewriting is done. + for (FileCandidate fc : group.files) + { + for (int rgId = 0; rgId < fc.rgCount; rgId++) + { + gcSnapshotBitmaps.remove(RetinaUtils.buildRgKey(fc.fileId, rgId)); + } + } + + // Edge case: all rows in the group were deleted — skip catalog registration, + // delete the empty file, and return early. The old files will be cleaned up + // by the delayed-cleanup phase once it is implemented. 
+ if (globalNewRowOffset == 0) + { + logger.info("StorageGC: all rows deleted for table={}, vNodeId={}, skipping empty file", + group.tableId, group.virtualNodeId); + try + { + storage.delete(newFilePath, false); + } + catch (IOException e) + { + logger.warn("StorageGC: failed to delete empty rewrite file {}", newFilePath, e); + } + return new RewriteResult(group, newFilePath, -1, + 0, new int[0], new int[]{0}, forwardRgMappings, Collections.emptyList(), + Collections.emptyList()); + } + + // Read the new file's Footer to get per-RG row counts. + int newFileRgCount; + int[] newFileRgActualRecordNums; + int[] newFileRgRowStart; + try (PixelsReader newReader = PixelsReaderImpl.newBuilder() + .setStorage(storage).setPath(newFilePath) + .setPixelsFooterCache(footerCache).build()) + { + newFileRgCount = newReader.getRowGroupNum(); + newFileRgActualRecordNums = new int[newFileRgCount]; + newFileRgRowStart = new int[newFileRgCount + 1]; + int accum = 0; + for (int rgId = 0; rgId < newFileRgCount; rgId++) + { + newFileRgActualRecordNums[rgId] = newReader.getRowGroupInfo(rgId).getNumberOfRows(); + newFileRgRowStart[rgId] = accum; + accum += newFileRgActualRecordNums[rgId]; + } + newFileRgRowStart[newFileRgCount] = accum; + } + + // Build backward mappings by inverting the forward mappings. + // newGlobal values are globally monotonic (globalNewRowOffset is a single counter), + // so we advance a linear cursor instead of binary-searching per row. 
+ List backwardInfos = new ArrayList<>(); + int curNewRgId = 0; + for (FileCandidate fc : group.files) + { + Map rgMappings = forwardRgMappings.get(fc.fileId); + int[] oldFileRgRowStart = computeOldFileRgRowStart(rgMappings, fc.rgCount); + Map bwdMappings = new HashMap<>(); + for (int oldRgId = 0; oldRgId < fc.rgCount; oldRgId++) + { + int[] fwdMapping = rgMappings.get(oldRgId); + if (fwdMapping == null) + { + continue; + } + for (int oldOff = 0; oldOff < fwdMapping.length; oldOff++) + { + int newGlobal = fwdMapping[oldOff]; + if (newGlobal < 0) + { + continue; + } + while (curNewRgId + 1 < newFileRgCount + && newGlobal >= newFileRgRowStart[curNewRgId + 1]) + { + curNewRgId++; + } + int newRgOff = newGlobal - newFileRgRowStart[curNewRgId]; + int oldGlobal = oldFileRgRowStart[oldRgId] + oldOff; + bwdMappings.computeIfAbsent(curNewRgId, k -> + { + int[] arr = new int[newFileRgActualRecordNums[k]]; + Arrays.fill(arr, -1); + return arr; + })[newRgOff] = oldGlobal; + } + } + backwardInfos.add(new BackwardInfo(fc.fileId, bwdMappings, oldFileRgRowStart)); + } + + // Register the new file as TEMPORARY in the catalog and initialise Visibility. + // Track registration progress so that partial state can be cleaned up on failure. 
+ long newFileId = -1; + int registeredRgCount = 0; + try + { + long minRowId = Long.MAX_VALUE, maxRowId = Long.MIN_VALUE; + for (FileCandidate fc : group.files) + { + minRowId = Math.min(minRowId, fc.file.getMinRowId()); + maxRowId = Math.max(maxRowId, fc.file.getMaxRowId()); + } + File newFile = new File(); + newFile.setName(newFileName); + newFile.setType(File.Type.TEMPORARY); + newFile.setNumRowGroup(newFileRgCount); + newFile.setMinRowId(minRowId); + newFile.setMaxRowId(maxRowId); + newFile.setPathId(group.files.get(0).file.getPathId()); + metadataService.addFiles(Collections.singletonList(newFile)); + newFileId = metadataService.getFileId(newFilePath); + + for (int rgId = 0; rgId < newFileRgCount; rgId++) + { + resourceManager.addVisibility(newFileId, rgId, newFileRgActualRecordNums[rgId], safeGcTs, null, false); + registeredRgCount = rgId + 1; + } + } + catch (Exception e) + { + cleanupTemporaryFile(storage, newFilePath, newFileId, registeredRgCount); + throw e; + } + + return new RewriteResult(group, newFilePath, newFileId, + newFileRgCount, newFileRgActualRecordNums, newFileRgRowStart, + forwardRgMappings, backwardInfos, pendingIndexEntries); + } + + /** + * Best-effort cleanup of a partially-created TEMPORARY file. Removes the + * catalog record, the physical file, and any RGVisibility keys that were + * registered before the failure. 
+ */ + private void cleanupTemporaryFile(Storage storage, String newFilePath, + long newFileId, int registeredRgCount) + { + if (newFileId > 0) + { + for (int rgId = 0; rgId < registeredRgCount; rgId++) + { + try + { + resourceManager.reclaimVisibility(newFileId, rgId, 0); + } + catch (Exception ex) + { + logger.warn("StorageGC cleanup: failed to remove Visibility for fileId={}, rgId={}", newFileId, rgId, ex); + } + } + try + { + metadataService.deleteFiles(Collections.singletonList(newFileId)); + } + catch (Exception ex) + { + logger.warn("StorageGC cleanup: failed to delete catalog entry for fileId={}", newFileId, ex); + } + } + try + { + if (storage.exists(newFilePath)) + { + storage.delete(newFilePath, false); + } + } + catch (IOException ex) + { + logger.warn("StorageGC cleanup: failed to delete physical file {}", newFilePath, ex); + } + } + + // ------------------------------------------------------------------------- + // Visibility Synchronization + // ------------------------------------------------------------------------- + + /** + * Exports deletion chain items from old files, performs coordinate transformation, + * and imports them into the corresponding new file RGs. + * + * @param result the rewrite result containing forward mappings and new file metadata + * @param safeGcTs the safe GC timestamp; only chain items with ts > safeGcTs are exported + */ + void syncVisibility(RewriteResult result, long safeGcTs) throws RetinaException + { + // Buckets keyed by new RG id; values are interleaved (newRgRowOffset, timestamp) pairs + // stored as growable primitive long[] to avoid Long boxing overhead. 
+ Map bucketArrays = new HashMap<>(); + Map bucketSizes = new HashMap<>(); + + for (FileCandidate fc : result.group.files) + { + Map fileMapping = result.forwardRgMappings.get(fc.fileId); + if (fileMapping == null) + { + continue; + } + for (int rgId = 0; rgId < fc.rgCount; rgId++) + { + long[] items = resourceManager.exportChainItemsAfter(fc.fileId, rgId, safeGcTs); + if (items == null || items.length == 0) + { + continue; + } + int[] fwdMapping = fileMapping.get(rgId); + if (fwdMapping == null) + { + continue; + } + + for (int i = 0; i < items.length; i += 2) + { + int oldRgRowOffset = (int) items[i]; + long timestamp = items[i + 1]; + if (oldRgRowOffset < 0 || oldRgRowOffset >= fwdMapping.length) + { + continue; + } + int newGlobal = fwdMapping[oldRgRowOffset]; + if (newGlobal < 0) + { + continue; + } + int newRgId = RetinaResourceManager.rgIdForGlobalRowOffset(newGlobal, result.newFileRgRowStart); + int newRgOff = newGlobal - result.newFileRgRowStart[newRgId]; + + int size = bucketSizes.getOrDefault(newRgId, 0); + long[] arr = bucketArrays.get(newRgId); + if (arr == null) + { + arr = new long[16]; + bucketArrays.put(newRgId, arr); + } + else if (size + 2 > arr.length) + { + arr = Arrays.copyOf(arr, arr.length * 2); + bucketArrays.put(newRgId, arr); + } + arr[size] = newRgOff; + arr[size + 1] = timestamp; + bucketSizes.put(newRgId, size + 2); + } + } + } + + for (Map.Entry entry : bucketArrays.entrySet()) + { + int newRgId = entry.getKey(); + int size = bucketSizes.get(newRgId); + long[] interleaved = (size == entry.getValue().length) + ? 
entry.getValue() + : Arrays.copyOf(entry.getValue(), size); + resourceManager.importDeletionChain(result.newFileId, newRgId, interleaved); + } + } + + // ------------------------------------------------------------------------- + // PK byte extraction + // ------------------------------------------------------------------------- + + /** + * Extracts and concatenates the primary-key column bytes from a batch row, + * using the same encoding as {@link TypeDescription#convertSqlStringToByte}. + */ + private static ByteString extractPkBytes(VectorizedRowBatch batch, int row, + int[] pkColIndices, List pkColTypes) + { + if (pkColIndices.length == 1) + { + byte[] bytes = pkColTypes.get(0).convertColumnVectorToByte(batch.cols[pkColIndices[0]], row); + return ByteString.copyFrom(bytes); + } + int totalLen = 0; + byte[][] parts = new byte[pkColIndices.length][]; + for (int i = 0; i < pkColIndices.length; i++) + { + parts[i] = pkColTypes.get(i).convertColumnVectorToByte(batch.cols[pkColIndices[i]], row); + totalLen += parts[i].length; + } + ByteBuffer buf = ByteBuffer.allocate(totalLen); + for (byte[] part : parts) + { + buf.put(part); + } + return ByteString.copyFrom((ByteBuffer) buf.rewind()); + } + + // ------------------------------------------------------------------------- + // Index Synchronization + // ------------------------------------------------------------------------- + + /** + * Allocates new rowIds, inserts MainIndex entries, and updates the SinglePointIndex + * for all kept rows in the rewrite result. 
+ * + * @param result the rewrite result (mutated: sets {@code newRowIdStart} and {@code oldRowIds}) + * @param tableId the table owning the rewritten files + */ + void syncIndex(RewriteResult result, long tableId) throws Exception + { + int totalRows = result.newFileRgRowStart[result.newFileRgCount]; + if (totalRows == 0) + { + return; + } + + MainIndex mainIndex = MainIndexFactory.Instance().getMainIndex(tableId); + IndexProto.RowIdBatch rowIdBatch = mainIndex.allocateRowIdBatch(tableId, totalRows); + long newRowIdStart = rowIdBatch.getRowIdStart(); + result.newRowIdStart = newRowIdStart; + + insertMainIndexEntries(result, mainIndex, newRowIdStart); + + if (!result.pendingIndexEntries.isEmpty()) + { + result.oldRowIds = updateSinglePointIndex(result, tableId, newRowIdStart); + } + } + + private void insertMainIndexEntries(RewriteResult result, MainIndex mainIndex, + long newRowIdStart) throws Exception + { + int totalRows = result.newFileRgRowStart[result.newFileRgCount]; + List entries = new ArrayList<>(totalRows); + int curRgId = 0; + for (int i = 0; i < totalRows; i++) + { + while (curRgId + 1 < result.newFileRgCount + && i >= result.newFileRgRowStart[curRgId + 1]) + { + curRgId++; + } + int rgOff = i - result.newFileRgRowStart[curRgId]; + entries.add(IndexProto.PrimaryIndexEntry.newBuilder() + .setRowId(newRowIdStart + i) + .setRowLocation(IndexProto.RowLocation.newBuilder() + .setFileId(result.newFileId).setRgId(curRgId).setRgRowOffset(rgOff)) + .build()); + } + mainIndex.putEntries(entries); + mainIndex.flushCache(result.newFileId); + } + + private List updateSinglePointIndex(RewriteResult result, long tableId, + long newRowIdStart) throws Exception + { + io.pixelsdb.pixels.common.metadata.domain.SinglePointIndex primaryIndex = + metadataService.getPrimaryIndex(tableId); + IndexOption indexOption = IndexOption.builder().vNodeId(result.group.virtualNodeId).build(); + io.pixelsdb.pixels.common.index.SinglePointIndex spIndex = + 
SinglePointIndexFactory.Instance().getSinglePointIndex( + tableId, primaryIndex.getId(), indexOption); + + List oldRowIds = new ArrayList<>(); + for (PendingIndexEntry pe : result.pendingIndexEntries) + { + long newRowId = newRowIdStart + pe.newGlobalRowOffset; + IndexProto.IndexKey key = IndexProto.IndexKey.newBuilder() + .setTableId(tableId).setIndexId(primaryIndex.getId()) + .setKey(pe.pkBytes).setTimestamp(pe.createTs).build(); + long oldRowId = spIndex.updatePrimaryEntry(key, newRowId); + if (oldRowId >= 0) + { + oldRowIds.add(oldRowId); + } + else + { + logger.warn("StorageGC syncIndex: updatePrimaryEntry returned {} for tableId={}, " + + "newGlobalRowOffset={} — index may be inconsistent", oldRowId, tableId, pe.newGlobalRowOffset); + } + } + return oldRowIds; + } + + // ------------------------------------------------------------------------- + // Commit (atomic switch + delayed cleanup) + // ------------------------------------------------------------------------- + + /** + * Atomically promotes the new TEMPORARY file to REGULAR, deletes old files from + * the catalog, unregisters dual-write, and enqueues the old files for delayed cleanup. 
+ */ + void commitFileGroup(RewriteResult result) throws Exception + { + List oldFileIds = result.group.files.stream() + .map(fc -> fc.fileId).collect(Collectors.toList()); + + try + { + metadataService.atomicSwapFiles(result.newFileId, oldFileIds); + } + catch (Exception e) + { + File newFile = metadataService.getFileById(result.newFileId); + if (newFile != null && newFile.getType() == File.Type.REGULAR) + { + logger.warn("atomicSwapFiles gRPC failed but server committed, continuing", e); + } + else + { + throw e; + } + } + + unregisterDualWrite(result); + + long retireDeadline = System.currentTimeMillis() + retireDelayMs; + for (FileCandidate fc : result.group.files) + { + resourceManager.scheduleRetiredFile( + new RetinaResourceManager.RetiredFile( + fc.fileId, fc.rgCount, fc.filePath, retireDeadline, result.oldRowIds)); + } + } + + // ------------------------------------------------------------------------- + // Rollback + // ------------------------------------------------------------------------- + + /** + * Best-effort rollback of a partially-completed GC cycle for one file group. + * Reverses index changes, dual-write, visibility, catalog, and physical file. 
+ */ + void rollback(RewriteResult result) + { + if (result == null || result.newFileId <= 0) + { + return; + } + try + { + File newFile = metadataService.getFileById(result.newFileId); + if (newFile != null && newFile.getType() == File.Type.REGULAR) + { + logger.error("Cannot rollback: new file already REGULAR (id={})", result.newFileId); + return; + } + + if (result.oldRowIds != null && !result.oldRowIds.isEmpty()) + { + rollbackSinglePointIndex(result); + } + + if (result.newRowIdStart > 0) + { + try + { + int totalRows = result.newFileRgRowStart[result.newFileRgCount]; + MainIndex mainIndex = MainIndexFactory.Instance().getMainIndex(result.group.tableId); + mainIndex.deleteRowIdRange(new RowIdRange(result.newRowIdStart, + result.newRowIdStart + totalRows, result.newFileId, 0, 0, totalRows)); + } + catch (Exception ex) + { + logger.warn("Rollback: failed to clean MainIndex for fileId={}", result.newFileId, ex); + } + } + + unregisterDualWrite(result); + + for (int rgId = 0; rgId < result.newFileRgCount; rgId++) + { + try + { + resourceManager.reclaimVisibility(result.newFileId, rgId, 0); + } + catch (Exception ex) + { + logger.warn("Rollback: failed to remove Visibility for fileId={}, rgId={}", + result.newFileId, rgId, ex); + } + } + + try + { + metadataService.deleteFiles(Collections.singletonList(result.newFileId)); + } + catch (Exception ex) + { + logger.warn("Rollback: failed to delete catalog entry for fileId={}", result.newFileId, ex); + } + + try + { + Storage storage = StorageFactory.Instance().getStorage(result.newFilePath); + if (storage.exists(result.newFilePath)) + { + storage.delete(result.newFilePath, false); + } + } + catch (IOException ex) + { + logger.warn("Rollback: failed to delete physical file {}", result.newFilePath, ex); + } + } + catch (Exception e) + { + logger.error("Rollback failed for FileGroup tableId={}", result.group.tableId, e); + } + } + + private void rollbackSinglePointIndex(RewriteResult result) + { + try + { + 
io.pixelsdb.pixels.common.metadata.domain.SinglePointIndex primaryIndex = + metadataService.getPrimaryIndex(result.group.tableId); + if (primaryIndex == null) + { + return; + } + IndexOption indexOption = IndexOption.builder().vNodeId(result.group.virtualNodeId).build(); + io.pixelsdb.pixels.common.index.SinglePointIndex spIndex = + SinglePointIndexFactory.Instance().getSinglePointIndex( + result.group.tableId, primaryIndex.getId(), indexOption); + int idx = 0; + for (PendingIndexEntry pe : result.pendingIndexEntries) + { + if (idx >= result.oldRowIds.size()) + { + break; + } + IndexProto.IndexKey key = IndexProto.IndexKey.newBuilder() + .setTableId(result.group.tableId).setIndexId(primaryIndex.getId()) + .setKey(pe.pkBytes).setTimestamp(pe.createTs).build(); + spIndex.updatePrimaryEntry(key, result.oldRowIds.get(idx++)); + } + } + catch (Exception e) + { + logger.warn("Rollback: failed to revert SinglePointIndex for tableId={}", + result.group.tableId, e); + } + } + + // ------------------------------------------------------------------------- + // Per-group orchestration + // ------------------------------------------------------------------------- + + /** + * Processes a single file group through the complete GC pipeline: + * rewrite → dual-write → visibility sync → index sync → commit. + * On failure, releases group bitmaps and performs a best-effort rollback. 
+ */ + void processFileGroup(FileGroup group, long safeGcTs, + Map gcSnapshotBitmaps) + { + RewriteResult result = null; + try + { + result = rewriteFileGroup(group, safeGcTs, gcSnapshotBitmaps); + + if (result.newFileId <= 0) + { + return; + } + + registerDualWrite(result); + + syncVisibility(result, safeGcTs); + + syncIndex(result, group.tableId); + + commitFileGroup(result); + + logger.info("StorageGC completed for FileGroup tableId={}, vNodeId={}, newFileId={}", + group.tableId, group.virtualNodeId, result.newFileId); + } + catch (Exception e) + { + logger.error("StorageGC failed for FileGroup tableId={}, vNodeId={}", + group.tableId, group.virtualNodeId, e); + releaseGroupBitmaps(group, gcSnapshotBitmaps); + rollback(result); + } + } + + private void releaseGroupBitmaps(FileGroup group, Map gcSnapshotBitmaps) + { + for (FileCandidate fc : group.files) + { + for (int rgId = 0; rgId < fc.rgCount; rgId++) + { + gcSnapshotBitmaps.remove(RetinaUtils.buildRgKey(fc.fileId, rgId)); + } + } + } +} diff --git a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRGVisibility.java b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRGVisibility.java index d21384b8e0..f63bb826dd 100644 --- a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRGVisibility.java +++ b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRGVisibility.java @@ -42,7 +42,7 @@ public class TestRGVisibility @Before public void setUp() { - rgVisibility = new RGVisibility(ROW_COUNT); + rgVisibility = new RGVisibility(ROW_COUNT, 0L, null); } @After @@ -57,7 +57,16 @@ public void testRGVisibilityInitialized() long timestamp1 = 100; long timestamp2 = 200; - long[] bitmap = {1, 0, 0, 0}; + // Probe the native library to determine per-tile bitmap size, + // which depends on RETINA_CAPACITY set at compile time. 
+ int bitmapWords; + try (RGVisibility probe = new RGVisibility(1, 0L, null)) + { + bitmapWords = probe.getVisibilityBitmap(0).length; + } + + long[] bitmap = new long[bitmapWords]; + bitmap[0] = 1; RGVisibility rgVisibilityInitialized = new RGVisibility(256, 0, bitmap); rgVisibilityInitialized.deleteRecord(5, timestamp1); @@ -69,6 +78,8 @@ public void testRGVisibilityInitialized() long[] bitmap2 = rgVisibilityInitialized.getVisibilityBitmap(timestamp2); assertEquals(0b1000010000100001L, bitmap2[0]); + + rgVisibilityInitialized.close(); } @Test @@ -388,4 +399,314 @@ class DeleteRecord verifyBitmap.accept(maxTimestamp.get(), finalBitmap); } + + // ===================================================================== + // gcSnapshotBitmap JNI round-trip tests + // + // Verify that garbageCollect() (now returning long[]) produces a bitmap + // identical to getVisibilityBitmap() called BEFORE GC — the independent + // ground truth computed from the full, unmodified deletion chain. + // ===================================================================== + + private static void assertBitmapsEqual(String msg, long[] expected, long[] actual) + { + assertEquals(msg + ": length mismatch", expected.length, actual.length); + for (int i = 0; i < expected.length; i++) + { + if (expected[i] != actual[i]) + { + fail(String.format("%s: word %d (rows %d-%d) mismatch%n expected: %s%n actual: %s", + msg, i, i * 64, i * 64 + 63, + String.format("%64s", Long.toBinaryString(expected[i])).replace(' ', '0'), + String.format("%64s", Long.toBinaryString(actual[i])).replace(' ', '0'))); + } + } + } + + @Test + public void testGcSnapshotEarlyReturnA() + { + // Empty chain → all-zero snapshot + long[] preRef0 = rgVisibility.getVisibilityBitmap(100); + long[] snap0 = rgVisibility.garbageCollect(100); + assertBitmapsEqual("empty chain", preRef0, snap0); + for (long w : snap0) + { + assertEquals(0L, w); + } + + // Add deletes and compact to advance baseTimestamp + rgVisibility.deleteRecord(5, 
100); + rgVisibility.deleteRecord(10, 100); + rgVisibility.deleteRecord(15, 200); + + // First GC at ts=200 → compact all items + long[] preRef1 = rgVisibility.getVisibilityBitmap(200); + long[] snap1 = rgVisibility.garbageCollect(200); + assertBitmapsEqual("first compact", preRef1, snap1); + + // Second GC at ts=200 → early return A (ts == baseTimestamp) + long[] preRef2 = rgVisibility.getVisibilityBitmap(200); + long[] snap2 = rgVisibility.garbageCollect(200); + assertBitmapsEqual("repeat GC", preRef2, snap2); + + // Both snapshots must be identical + assertBitmapsEqual("snap1 vs snap2", snap1, snap2); + } + + @Test + public void testGcSnapshotEarlyReturnB() + { + // 5 items in one block: ts 1,2,3,8,10. Block last ts=10 > safeGcTs=5 + rgVisibility.deleteRecord(0, 1); + rgVisibility.deleteRecord(1, 2); + rgVisibility.deleteRecord(2, 3); + rgVisibility.deleteRecord(3, 8); + rgVisibility.deleteRecord(4, 10); + + long[] preRef = rgVisibility.getVisibilityBitmap(5); + long[] snapshot = rgVisibility.garbageCollect(5); + assertBitmapsEqual("early return B", preRef, snapshot); + + // Rows 0,1,2 marked (ts ≤ 5); rows 3,4 not + assertEquals(0b111L, snapshot[0]); + } + + @Test + public void testGcSnapshotCompactWithBoundary() + { + // 10 items: rows 0-9, ts 1-10 + // Block 1 (8 items, ts 1-8): compactable at safeGcTs=9 + // Block 2 (2 items, ts 9-10): boundary block (tail) + for (int i = 0; i < 10; i++) + { + rgVisibility.deleteRecord(i, i + 1); + } + + long[] preRef = rgVisibility.getVisibilityBitmap(9); + long[] snapshot = rgVisibility.garbageCollect(9); + assertBitmapsEqual("compact with boundary", preRef, snapshot); + + // Rows 0-8 marked, row 9 not + assertEquals(0x1FFL, snapshot[0]); + } + + @Test + public void testGcSnapshotCompactAllBlocks() + { + // 8 items fill one block: rows 0-7, ts 1-8 + for (int i = 0; i < 8; i++) + { + rgVisibility.deleteRecord(i, i + 1); + } + + // safeGcTs=10 > all item ts → entire block compacted + long[] preRef = 
rgVisibility.getVisibilityBitmap(10); + long[] snapshot = rgVisibility.garbageCollect(10); + assertBitmapsEqual("compact all blocks", preRef, snapshot); + + assertEquals(0xFFL, snapshot[0]); + } + + @Test + public void testGcSnapshotCompactMultiBlock() + { + // 20 items: rows 0-19, ts 1-20 + // Block 1 (ts 1-8), Block 2 (ts 9-16), Block 3 tail (ts 17-20) + for (int i = 0; i < 20; i++) + { + rgVisibility.deleteRecord(i, i + 1); + } + + // safeGcTs=18: blocks 1,2 compacted, block 3 is boundary + long[] preRef = rgVisibility.getVisibilityBitmap(18); + long[] snapshot = rgVisibility.garbageCollect(18); + assertBitmapsEqual("compact multi-block", preRef, snapshot); + + // Rows 0-17 marked + assertEquals((1L << 18) - 1, snapshot[0]); + } + + @Test + public void testGcSnapshotCrossTile() + { + // Tile 0: rows 0-255 Tile 1: rows 256-511 Tile 2: rows 512-767 + rgVisibility.deleteRecord(5, 1); + rgVisibility.deleteRecord(10, 2); + rgVisibility.deleteRecord(260, 3); // tile 1 + rgVisibility.deleteRecord(600, 4); // tile 2 + rgVisibility.deleteRecord(100, 5); // tile 0 + rgVisibility.deleteRecord(300, 6); // tile 1 + + long[] preRef1 = rgVisibility.getVisibilityBitmap(4); + long[] snap1 = rgVisibility.garbageCollect(4); + assertBitmapsEqual("cross-tile ts=4", preRef1, snap1); + + long[] preRef2 = rgVisibility.getVisibilityBitmap(6); + long[] snap2 = rgVisibility.garbageCollect(6); + assertBitmapsEqual("cross-tile ts=6", preRef2, snap2); + } + + @Test + public void testGcSnapshotProgressiveRounds() + { + // Phase 1: 20 deletes at ts 1-20 + for (int i = 0; i < 20; i++) + { + rgVisibility.deleteRecord(i, i + 1); + } + + long[] preRef1 = rgVisibility.getVisibilityBitmap(5); + long[] snap1 = rgVisibility.garbageCollect(5); + assertBitmapsEqual("round 1", preRef1, snap1); + + long[] preRef2 = rgVisibility.getVisibilityBitmap(12); + long[] snap2 = rgVisibility.garbageCollect(12); + assertBitmapsEqual("round 2", preRef2, snap2); + + // Phase 2: 10 more deletes at ts 21-30 + for (int i 
= 20; i < 30; i++) + { + rgVisibility.deleteRecord(i, i + 1); + } + + long[] preRef3 = rgVisibility.getVisibilityBitmap(25); + long[] snap3 = rgVisibility.garbageCollect(25); + assertBitmapsEqual("round 3", preRef3, snap3); + + long[] preRef4 = rgVisibility.getVisibilityBitmap(100); + long[] snap4 = rgVisibility.garbageCollect(100); + assertBitmapsEqual("round 4", preRef4, snap4); + + // All 30 rows marked + assertEquals((1L << 30) - 1, snap4[0]); + } + + @Test + public void testGcSnapshotRandomized() + { + Random rng = new Random(42); + boolean[] deleted = new boolean[ROW_COUNT]; + long ts = 1; + long lastGcTs = 0; + + for (int round = 0; round < 10; round++) + { + for (int d = 0; d < 100; d++) + { + int rowId; + do { rowId = rng.nextInt(ROW_COUNT); } while (deleted[rowId]); + deleted[rowId] = true; + rgVisibility.deleteRecord(rowId, ts); + ts++; + } + + long gcTs = lastGcTs + 51; + if (gcTs >= ts) gcTs = ts - 1; + + long[] preRef = rgVisibility.getVisibilityBitmap(gcTs); + long[] snapshot = rgVisibility.garbageCollect(gcTs); + assertBitmapsEqual("randomized round " + round, preRef, snapshot); + lastGcTs = gcTs; + } + + long[] preRefFinal = rgVisibility.getVisibilityBitmap(ts + 100); + long[] finalSnap = rgVisibility.garbageCollect(ts + 100); + assertBitmapsEqual("randomized final", preRefFinal, finalSnap); + } + + // ===================================================================== + // exportChainItemsAfter / importDeletionChain JNI tests + // ===================================================================== + + @Test + public void testExportChainItemsAfter() + { + rgVisibility.deleteRecord(5, 50); + rgVisibility.deleteRecord(10, 100); + rgVisibility.deleteRecord(15, 150); + rgVisibility.deleteRecord(20, 200); + rgVisibility.deleteRecord(300, 250); + + long[] items = rgVisibility.exportChainItemsAfter(100); + + assertNotNull("export should return non-null", items); + assertEquals("interleaved pairs: 3 items × 2", 6, items.length); + + Set exportedTs = 
new HashSet<>(); + for (int i = 0; i < items.length; i += 2) + { + exportedTs.add(items[i + 1]); + } + assertTrue("should contain ts=150", exportedTs.contains(150L)); + assertTrue("should contain ts=200", exportedTs.contains(200L)); + assertTrue("should contain ts=250", exportedTs.contains(250L)); + assertFalse("should NOT contain ts=50", exportedTs.contains(50L)); + assertFalse("should NOT contain ts=100", exportedTs.contains(100L)); + } + + @Test + public void testImportDeletionChain() + { + long safeGcTs = 100; + try (RGVisibility newVis = new RGVisibility(ROW_COUNT, safeGcTs, null)) + { + long[] items = {5, 150, 10, 200, 300, 250}; + newVis.importDeletionChain(items); + + long[] bitmap = newVis.getVisibilityBitmap(300); + assertTrue("row 5 should be deleted", + (bitmap[5 / 64] & (1L << (5 % 64))) != 0); + assertTrue("row 10 should be deleted", + (bitmap[10 / 64] & (1L << (10 % 64))) != 0); + assertTrue("row 300 should be deleted", + (bitmap[300 / 64] & (1L << (300 % 64))) != 0); + + long[] partialBitmap = newVis.getVisibilityBitmap(180); + assertTrue("row 5 at ts=180 should be deleted", + (partialBitmap[5 / 64] & (1L << (5 % 64))) != 0); + assertFalse("row 10 at ts=180 should NOT be deleted", + (partialBitmap[10 / 64] & (1L << (10 % 64))) != 0); + } + } + + @Test + public void testExportImportRoundTrip() + { + long safeGcTs = 100; + + rgVisibility.deleteRecord(5, 50); + rgVisibility.deleteRecord(10, 80); + rgVisibility.deleteRecord(15, 150); + rgVisibility.deleteRecord(20, 200); + rgVisibility.deleteRecord(300, 250); + + long[] exported = rgVisibility.exportChainItemsAfter(safeGcTs); + assertEquals("3 items exported (ts=150,200,250)", 6, exported.length); + + try (RGVisibility newVis = new RGVisibility(ROW_COUNT, safeGcTs, null)) + { + newVis.importDeletionChain(exported); + + for (long snapTs : new long[]{150, 200, 250, 500}) + { + long[] oldBitmap = rgVisibility.getVisibilityBitmap(snapTs); + long[] newBitmap = newVis.getVisibilityBitmap(snapTs); + + for (int 
row : new int[]{15, 20, 300}) + { + boolean oldDel = (oldBitmap[row / 64] & (1L << (row % 64))) != 0; + boolean newDel = (newBitmap[row / 64] & (1L << (row % 64))) != 0; + assertEquals("snap_ts=" + snapTs + " row=" + row, oldDel, newDel); + } + + for (int row : new int[]{5, 10}) + { + boolean newDel = (newBitmap[row / 64] & (1L << (row % 64))) != 0; + assertFalse("row " + row + " (ts<=safeGcTs) should NOT be in new at snap=" + snapTs, + newDel); + } + } + } + } } \ No newline at end of file diff --git a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaCheckpoint.java b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaCheckpoint.java index 2f0606c92e..15ba28ce14 100644 --- a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaCheckpoint.java +++ b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaCheckpoint.java @@ -22,6 +22,7 @@ import io.pixelsdb.pixels.common.exception.RetinaException; import io.pixelsdb.pixels.common.physical.Storage; import io.pixelsdb.pixels.common.physical.StorageFactory; +import io.pixelsdb.pixels.common.utils.CheckpointFileIO; import io.pixelsdb.pixels.common.utils.ConfigFactory; import io.pixelsdb.pixels.common.utils.RetinaUtils; import org.junit.Before; @@ -31,8 +32,12 @@ import java.io.DataOutputStream; import java.io.IOException; import java.lang.reflect.Field; +import java.lang.reflect.Method; import java.net.InetAddress; +import java.util.Arrays; +import java.util.HashMap; import java.util.Map; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -40,7 +45,10 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.ThreadLocalRandom; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; import static 
org.junit.Assert.assertTrue; /** @@ -59,7 +67,7 @@ public class TestRetinaCheckpoint @Before public void setUp() throws IOException, RetinaException { - testCheckpointDir = ConfigFactory.Instance().getProperty("pixels.retina.checkpoint.dir"); + testCheckpointDir = ConfigFactory.Instance().getProperty("retina.checkpoint.dir"); storage = StorageFactory.Instance().getStorage(testCheckpointDir); if (!storage.exists(testCheckpointDir)) @@ -104,7 +112,7 @@ private String getGcFileName(long timestamp) { public void testRegisterOffload() throws RetinaException, IOException { System.out.println("\n[Test] Starting testRegisterOffload..."); - retinaManager.addVisibility(fileId, rgId, numRows); + retinaManager.addVisibility(fileId, rgId, numRows, 0L, null, false); long timestamp = 100L; // Register offload @@ -129,7 +137,7 @@ public void testRegisterOffload() throws RetinaException, IOException public void testMultipleOffloads() throws RetinaException, IOException { System.out.println("\n[Test] Starting testMultipleOffloads..."); - retinaManager.addVisibility(fileId, rgId, numRows); + retinaManager.addVisibility(fileId, rgId, numRows, 0L, null, false); long timestamp1 = 100L; long timestamp1_dup = 100L; // same timestamp @@ -158,7 +166,7 @@ public void testMultipleOffloads() throws RetinaException, IOException public void testCheckpointRecovery() throws RetinaException, IOException { System.out.println("\n[Test] Starting testCheckpointRecovery..."); - retinaManager.addVisibility(fileId, rgId, numRows); + retinaManager.addVisibility(fileId, rgId, numRows, 0L, null, false); long timestamp = 100L; // 1. Delete row 10 @@ -209,7 +217,7 @@ public void testCheckpointRecovery() throws RetinaException, IOException // 7. 
Re-add Visibility, at this point it should see that it already exists in rgVisibilityMap System.out.println("Re-adding visibility for file (should skip as it already exists)..."); - retinaManager.addVisibility(fileId, rgId, numRows); + retinaManager.addVisibility(fileId, rgId, numRows, 0L, null, false); // 8. Verify state still correct long[] finalBitmap = retinaManager.queryVisibility(fileId, rgId, timestamp); @@ -221,7 +229,7 @@ public void testCheckpointRecovery() throws RetinaException, IOException public void testCheckpointRetryAfterFailure() throws RetinaException, IOException { System.out.println("\n[Test] Starting testCheckpointRetryAfterFailure..."); - retinaManager.addVisibility(fileId, rgId, numRows); + retinaManager.addVisibility(fileId, rgId, numRows, 0L, null, false); long timestamp = 123L; String expectedFile = resolve(testCheckpointDir, getOffloadFileName(timestamp)); @@ -259,7 +267,7 @@ public void testMultiRGCheckpoint() throws RetinaException, IOException int numRgs = 3; for (int i = 0; i < numRgs; i++) { - retinaManager.addVisibility(fileId, i, numRows); + retinaManager.addVisibility(fileId, i, numRows, 0L, null, false); } long timestamp = 200L; @@ -304,7 +312,7 @@ public void testCheckpointDataIntegrity() throws RetinaException, IOException int numRgs = 5; for (int i = 0; i < numRgs; i++) { - retinaManager.addVisibility(fileId, i, numRows); + retinaManager.addVisibility(fileId, i, numRows, 0L, null, false); } long timestamp = 300L; @@ -324,7 +332,7 @@ public void testCheckpointDataIntegrity() throws RetinaException, IOException public void testConcurrency() throws InterruptedException, RetinaException { System.out.println("\n[Test] Starting testConcurrency with 20 threads..."); - retinaManager.addVisibility(fileId, rgId, numRows); + retinaManager.addVisibility(fileId, rgId, numRows, 0L, null, false); int numThreads = 20; int operationsPerThread = 50; ExecutorService executor = Executors.newFixedThreadPool(numThreads); @@ -402,7 +410,7 @@ public 
void testCheckpointPerformance() throws RetinaException, IOException, Int System.out.println("[Perf] Populating visibility data..."); for (int i = 0; i < numFiles; i++) { - retinaManager.addVisibility(i, 0, rowsPerRg); + retinaManager.addVisibility(i, 0, rowsPerRg, 0L, null, false); } // 3. Delete Records based on Ratio @@ -555,4 +563,167 @@ private boolean isBitSet(long[] bitmap, int rowIndex) return (bitmap[longIndex] & (1L << bitOffset)) != 0; } + + // ----------------------------------------------------------------------- + // GC checkpoint: completeness + bitmap correctness + // ----------------------------------------------------------------------- + + /** + * Creates a {@code long[]} GC snapshot bitmap for one RG where exactly {@code deletedRows} + * out of {@code totalRows} rows are marked deleted (rows 0..deletedRows-1 are set). + */ + private static long[] makeBitmap(int totalRows, int deletedRows) + { + int words = (totalRows + 63) / 64; + long[] bitmap = new long[words]; + for (int r = 0; r < deletedRows; r++) + { + bitmap[r / 64] |= (1L << (r % 64)); + } + return bitmap; + } + + /** + * Calls {@code RetinaResourceManager.createCheckpoint(ts, CheckpointType.GC, bitmaps)} + * via reflection and blocks until the write completes. 
+ */ + @SuppressWarnings("unchecked") + private void invokeCreateGCCheckpoint(long ts, Map bitmaps) throws Exception + { + // Locate the private CheckpointType enum class + Class cpTypeClass = Arrays.stream(RetinaResourceManager.class.getDeclaredClasses()) + .filter(c -> c.getSimpleName().equals("CheckpointType")) + .findFirst() + .orElseThrow(() -> new RuntimeException("CheckpointType enum not found")); + + // Get the GC constant + Object gcConstant = Arrays.stream(cpTypeClass.getEnumConstants()) + .filter(e -> e.toString().equals("GC")) + .findFirst() + .orElseThrow(() -> new RuntimeException("CheckpointType.GC not found")); + + // Get the overloaded createCheckpoint(long, CheckpointType, Map) method + Method method = RetinaResourceManager.class.getDeclaredMethod( + "createCheckpoint", long.class, cpTypeClass, Map.class); + method.setAccessible(true); + + CompletableFuture future = (CompletableFuture) method.invoke( + retinaManager, ts, gcConstant, bitmaps); + future.join(); + } + + /** + * Verifies that a GC checkpoint written with a full {@code gcSnapshotBitmaps} map + * contains ALL RG entries — including those that would not be selected as Storage GC + * candidates — because the checkpoint is written before S1 scanning begins. + * + *

    Setup: 3 files in {@code rgVisibilityMap}: + *

      + *
    • File A: 80 % deleted (would be a candidate)
    • + *
    • File B: 60 % deleted (would be a candidate)
    • + *
    • File C: 20 % deleted (non-candidate)
    • + *
    + * + *

    Expected: checkpoint rgCount = 3; all three entries present with correct + * {@code recordNum} and bitmap content. + */ + @Test + public void testGCCheckpoint_containsAllRGs() throws Exception + { + final long fileIdA = 77001L; + final long fileIdB = 77002L; + final long fileIdC = 77003L; + final int rows = 100; + final long safeGcTs = 500L; + + retinaManager.addVisibility(fileIdA, 0, rows, 0L, null, false); + retinaManager.addVisibility(fileIdB, 0, rows, 0L, null, false); + retinaManager.addVisibility(fileIdC, 0, rows, 0L, null, false); + + long[] bitmapA = makeBitmap(rows, 80); + long[] bitmapB = makeBitmap(rows, 60); + long[] bitmapC = makeBitmap(rows, 20); + + Map gcBitmaps = new HashMap<>(); + gcBitmaps.put(fileIdA + "_0", bitmapA); + gcBitmaps.put(fileIdB + "_0", bitmapB); + gcBitmaps.put(fileIdC + "_0", bitmapC); + + invokeCreateGCCheckpoint(safeGcTs, gcBitmaps); + + String cpPath = resolve(testCheckpointDir, getGcFileName(safeGcTs)); + assertTrue("GC checkpoint file must exist", storage.exists(cpPath)); + + Map entries = new HashMap<>(); + int rgCount = CheckpointFileIO.readCheckpointParallel(cpPath, + e -> entries.put(e.fileId + "_" + e.rgId, e)); + + assertEquals("checkpoint must contain all 3 RGs (not just candidates)", 3, rgCount); + assertEquals("entries map size must be 3", 3, entries.size()); + + CheckpointFileIO.CheckpointEntry entA = entries.get(fileIdA + "_0"); + assertNotNull("fileIdA must be present", entA); + assertEquals("fileIdA recordNum", rows, entA.recordNum); + assertArrayEquals("fileIdA bitmap must match", bitmapA, entA.bitmap); + + CheckpointFileIO.CheckpointEntry entB = entries.get(fileIdB + "_0"); + assertNotNull("fileIdB must be present", entB); + assertEquals("fileIdB recordNum", rows, entB.recordNum); + assertArrayEquals("fileIdB bitmap must match", bitmapB, entB.bitmap); + + CheckpointFileIO.CheckpointEntry entC = entries.get(fileIdC + "_0"); + assertNotNull("fileIdC (non-candidate) must be present", entC); + 
assertEquals("fileIdC recordNum", rows, entC.recordNum); + assertArrayEquals("fileIdC bitmap must match", bitmapC, entC.bitmap); + } + + /** + * Verifies that the GC checkpoint bitmap content faithfully matches the + * {@code gcSnapshotBitmaps} passed to {@code createCheckpoint}: each word of each + * per-RG bitmap must be preserved exactly, with no cross-RG contamination. + * + *

    Uses a 2-RG file with deliberately complementary bitmaps: + *

      + *
    • RG 0: first word all-ones ({@code rows 0-63} deleted), second word zero
    • + *
    • RG 1: first word zero, second word all-ones ({@code rows 64-127} deleted)
    • + *
    + */ + @Test + public void testGCCheckpoint_bitmapContentIsExact() throws Exception + { + final long fileId = 88001L; + final int rows = 128; // 2 words per RG + final long safeGcTs = 600L; + + retinaManager.addVisibility(fileId, 0, rows, 0L, null, false); + retinaManager.addVisibility(fileId, 1, rows, 0L, null, false); + + long[] bitmapRg0 = new long[]{-1L, 0L}; // rows 0-63 deleted + long[] bitmapRg1 = new long[]{0L, -1L}; // rows 64-127 deleted + + Map gcBitmaps = new HashMap<>(); + gcBitmaps.put(fileId + "_0", bitmapRg0); + gcBitmaps.put(fileId + "_1", bitmapRg1); + + invokeCreateGCCheckpoint(safeGcTs, gcBitmaps); + + String cpPath = resolve(testCheckpointDir, getGcFileName(safeGcTs)); + assertTrue("GC checkpoint file must exist", storage.exists(cpPath)); + + Map entries = new HashMap<>(); + int rgCount = CheckpointFileIO.readCheckpointParallel(cpPath, + e -> entries.put(e.fileId + "_" + e.rgId, e)); + + assertEquals("checkpoint must contain 2 RGs", 2, rgCount); + + CheckpointFileIO.CheckpointEntry rg0 = entries.get(fileId + "_0"); + assertNotNull("RG 0 must be present", rg0); + assertEquals("RG 0 word 0 must be all-ones (rows 0-63 deleted)", -1L, rg0.bitmap[0]); + assertEquals("RG 0 word 1 must be zero (rows 64-127 live)", 0L, rg0.bitmap[1]); + + CheckpointFileIO.CheckpointEntry rg1 = entries.get(fileId + "_1"); + assertNotNull("RG 1 must be present", rg1); + assertEquals("RG 1 word 0 must be zero (rows 0-63 live)", 0L, rg1.bitmap[0]); + assertEquals("RG 1 word 1 must be all-ones (rows 64-127 deleted)", -1L, rg1.bitmap[1]); + } } diff --git a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaResourceManager.java b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaResourceManager.java index 0bfa32e455..6edb341693 100644 --- a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaResourceManager.java +++ b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaResourceManager.java @@ -53,7 +53,7 @@ public void 
TestVisibility() long fileId = 999; int rgId = 666; int recordNum = 100; - retinaResourceManager.addVisibility(fileId, rgId, recordNum); + retinaResourceManager.addVisibility(fileId, rgId, recordNum, 0L, null, false); long [] visibility = retinaResourceManager.queryVisibility(fileId, rgId, 0); printVisibility(visibility); diff --git a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestStorageGarbageCollector.java b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestStorageGarbageCollector.java new file mode 100644 index 0000000000..6281626267 --- /dev/null +++ b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestStorageGarbageCollector.java @@ -0,0 +1,3782 @@ +/* + * Copyright 2026 PixelsDB. + * + * This file is part of Pixels. + * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Affero GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. If not, see + * . 
+ */ +package io.pixelsdb.pixels.retina; + +import io.pixelsdb.pixels.common.metadata.MetadataService; +import io.pixelsdb.pixels.common.utils.CheckpointFileIO; +import io.pixelsdb.pixels.common.utils.PixelsFileNameUtils; +import io.pixelsdb.pixels.common.utils.RetinaUtils; +import io.pixelsdb.pixels.common.metadata.domain.Column; +import io.pixelsdb.pixels.common.metadata.domain.File; +import io.pixelsdb.pixels.common.metadata.domain.Layout; +import io.pixelsdb.pixels.common.physical.Storage; +import io.pixelsdb.pixels.common.physical.StorageFactory; +import io.pixelsdb.pixels.core.PixelsFooterCache; +import io.pixelsdb.pixels.core.PixelsReader; +import io.pixelsdb.pixels.core.PixelsReaderImpl; +import io.pixelsdb.pixels.core.PixelsWriter; +import io.pixelsdb.pixels.core.PixelsWriterImpl; +import io.pixelsdb.pixels.core.TypeDescription; +import io.pixelsdb.pixels.core.encoding.EncodingLevel; +import io.pixelsdb.pixels.core.reader.PixelsReaderOption; +import io.pixelsdb.pixels.core.reader.PixelsRecordReader; +import io.pixelsdb.pixels.core.vector.BinaryColumnVector; +import io.pixelsdb.pixels.core.vector.DoubleColumnVector; +import io.pixelsdb.pixels.core.vector.LongColumnVector; +import io.pixelsdb.pixels.core.vector.VectorizedRowBatch; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Test; + +import java.lang.reflect.Field; +import java.lang.reflect.Method; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CyclicBarrier; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicInteger; + +import 
static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +/** + * Tests for {@link StorageGarbageCollector}, covering scan/grouping, data rewrite, + * dual-write, visibility sync, index update, atomic switch, and end-to-end integration. + * + *

    All tests use real {@link RetinaResourceManager} (with JNI/C++ native library) + * and real {@link MetadataService} (requires a running metadata server). + * Rewrite tests write Pixels files to a local temp directory using {@code file://} + * URIs resolved by {@link io.pixelsdb.pixels.storage.localfs.LocalFS}. + * + *

    Test naming convention

    + * New tests follow {@code test{Section}_{feature}_{scenario}} where Section maps to: + *
      + *
    • S1 — scan/grouping and file-type filtering
    • + *
    • S2 — data rewrite (single/multi-file, single/multi-RG, hidden columns)
    • + *
    • S3 — dual-write propagation
    • + *
    • S4 — visibility sync (export/import/truncation)
    • + *
    • S5 — index sync
    • + *
    • S6 — atomic swap and deferred cleanup
    • + *
    • E2E — end-to-end integration
    • + *
    + * Legacy test names (pre-convention) are preserved for CI stability. + */ +public class TestStorageGarbageCollector +{ + // ----------------------------------------------------------------------- + // Class-level constants & fields + // ----------------------------------------------------------------------- + + private static final String TEST_SCHEMA = "gc_test"; + private static final String TEST_TABLE = "gc_test_tbl"; + private static final TypeDescription LONG_ID_SCHEMA = + TypeDescription.fromString("struct"); + + private static Path tmpDir; + private static Storage fileStorage; + private static MetadataService metadataService; + private static long testPathId; + private static String testOrderedPathUri; + + // ----------------------------------------------------------------------- + // Per-test fields + // ----------------------------------------------------------------------- + + private RetinaResourceManager retinaManager; + private StorageGarbageCollector gc; + + // ----------------------------------------------------------------------- + // Class-level setup / teardown + // ----------------------------------------------------------------------- + + @BeforeClass + public static void classSetUp() throws Exception + { + tmpDir = Files.createTempDirectory("pixels_gc_test_"); + fileStorage = StorageFactory.Instance().getStorage("file"); + metadataService = MetadataService.Instance(); + + if (metadataService.existSchema(TEST_SCHEMA)) + { + metadataService.dropSchema(TEST_SCHEMA); + } + metadataService.createSchema(TEST_SCHEMA); + Column col = new Column(); + col.setName("id"); + col.setType("long"); + String basePathUri = "file://" + tmpDir.toAbsolutePath(); + metadataService.createTable(TEST_SCHEMA, TEST_TABLE, Storage.Scheme.file, + Collections.singletonList(basePathUri), Collections.singletonList(col)); + Layout layout = metadataService.getLatestLayout(TEST_SCHEMA, TEST_TABLE); + testPathId = layout.getOrderedPaths().get(0).getId(); + testOrderedPathUri 
= layout.getOrderedPaths().get(0).getUri(); + Files.createDirectories(java.nio.file.Paths.get(testOrderedPathUri.replaceFirst("file://", ""))); + } + + @AfterClass + public static void classTearDown() + { + try + { + if (metadataService != null && metadataService.existSchema(TEST_SCHEMA)) + { + metadataService.dropSchema(TEST_SCHEMA); + } + } + catch (Exception e) + { + System.err.println("Warning: failed to drop test schema " + TEST_SCHEMA + ": " + e.getMessage()); + } + if (tmpDir != null) + { + deleteRecursive(tmpDir.toFile()); + } + } + + // ----------------------------------------------------------------------- + // Per-test setup / teardown + // ----------------------------------------------------------------------- + + @Before + public void setUp() + { + retinaManager = RetinaResourceManager.Instance(); + resetManagerState(); + cleanupOrderedDir(); + gc = new StorageGarbageCollector(retinaManager, metadataService, 0.5, 134_217_728L, Integer.MAX_VALUE, 10, + 1048576, EncodingLevel.EL2, 86_400_000L); + } + + @After + public void tearDown() + { + resetManagerState(); + cleanupOrderedDir(); + } + + /** + * Deletes all {@code .pxl} files from the shared test ordered-path directory after + * each test. Multiple tests write output files (from {@code rewriteFileGroup}) into + * this same directory. Without cleanup, a leftover file whose name was generated by + * {@link io.pixelsdb.pixels.common.utils.PixelsFileNameUtils#buildOrderedFileName} in a + * prior test can collide with the name generated for the current test (same + * {@link io.pixelsdb.pixels.common.utils.DateUtil#getCurTime()} wall-clock second and + * counter value when the JVM is reused across reruns), causing + * {@code LocalFS.create(overwrite=false)} to throw "File already exists" and the whole + * GC pipeline to fail. 
+ */ + private static void cleanupOrderedDir() + { + if (testOrderedPathUri == null) + { + return; + } + java.io.File dir = new java.io.File(testOrderedPathUri.replaceFirst("file://", "")); + if (!dir.exists() || !dir.isDirectory()) + { + return; + } + java.io.File[] files = dir.listFiles(); + if (files == null) + { + return; + } + for (java.io.File f : files) + { + if (f.isFile() && !f.delete()) + { + try { Thread.sleep(50); } catch (InterruptedException ignored) { } + if (!f.delete()) + { + System.err.println("Warning: failed to delete " + f.getAbsolutePath()); + } + } + } + } + + // ======================================================================= + // Section 1: groupAndMerge logic + // ======================================================================= + + /** + * Three candidates from three distinct {@code (tableId, virtualNodeId)} pairs; + * expect three separate groups sorted by invalidRatio descending. + */ + @Test + public void testGroupAndMerge_threeDistinctGroups() + { + StorageGarbageCollector gc = newGcForGrouping(0L, Integer.MAX_VALUE, 10); + + List candidates = Arrays.asList( + new StorageGarbageCollector.FileCandidate(makeFile(1, 1), "f1", 1, 1, 1L, 0, 0.60, 0L), + new StorageGarbageCollector.FileCandidate(makeFile(2, 1), "f2", 2, 1, 1L, 1, 0.70, 0L), + new StorageGarbageCollector.FileCandidate(makeFile(3, 1), "f3", 3, 1, 2L, 0, 0.80, 0L) + ); + + List groups = gc.groupAndMerge(candidates); + + assertEquals("expected 3 groups", 3, groups.size()); + // sorted by avg invalidRatio desc: 0.80, 0.70, 0.60 + assertEquals("group 0 should have highest ratio (0.80)", 0.80, + groups.get(0).files.get(0).invalidRatio, 1e-9); + assertEquals("group 1 should have ratio 0.70", 0.70, + groups.get(1).files.get(0).invalidRatio, 1e-9); + assertEquals("group 2 should have lowest ratio (0.60)", 0.60, + groups.get(2).files.get(0).invalidRatio, 1e-9); + } + + /** + * Two candidates with the same {@code (tableId, virtualNodeId)} must be in one group, + * with 
files sorted by invalidRatio descending inside the group. + */ + @Test + public void testGroupAndMerge_twoFilesInSameGroup() + { + StorageGarbageCollector gc = newGcForGrouping(0L, Integer.MAX_VALUE, 10); + + List candidates = Arrays.asList( + new StorageGarbageCollector.FileCandidate(makeFile(1, 1), "f1", 1, 1, 1L, 5, 0.60, 0L), + new StorageGarbageCollector.FileCandidate(makeFile(2, 1), "f2", 2, 1, 1L, 5, 0.80, 0L) + ); + + List groups = gc.groupAndMerge(candidates); + + assertEquals("both candidates should form one group", 1, groups.size()); + StorageGarbageCollector.FileGroup grp = groups.get(0); + assertEquals(1L, grp.tableId); + assertEquals(5, grp.virtualNodeId); + assertEquals(2, grp.files.size()); + assertTrue("files within group sorted by invalidRatio desc", + grp.files.get(0).invalidRatio >= grp.files.get(1).invalidRatio); + assertEquals(0.80, grp.files.get(0).invalidRatio, 1e-9); + assertEquals(0.60, grp.files.get(1).invalidRatio, 1e-9); + } + + /** + * When there are more candidate groups than {@code maxFileGroupsPerRun}, + * only the top-N groups (highest average invalidRatio) are returned. 
+ */ + @Test + public void testGroupAndMerge_maxGroupsCap() + { + int max = 3; + StorageGarbageCollector gc = newGcForGrouping(0L, Integer.MAX_VALUE, max); + + // Build 5 groups with different tableIds and clear invalidRatios (0.55..0.99) + List candidates = new ArrayList<>(); + for (int i = 0; i < 5; i++) + { + candidates.add(new StorageGarbageCollector.FileCandidate( + makeFile(i, 1), "f" + i, i, 1, (long) (i + 10), 0, 0.55 + i * 0.11, 0L)); + } + + List groups = gc.groupAndMerge(candidates); + + assertEquals("groups must be capped at maxFileGroupsPerRun", max, groups.size()); + // First group must have the highest average invalidRatio + double firstAvg = groups.get(0).files.stream() + .mapToDouble(c -> c.invalidRatio).average().orElse(0); + double lastAvg = groups.get(groups.size() - 1).files.stream() + .mapToDouble(c -> c.invalidRatio).average().orElse(0); + assertTrue("groups must be sorted best-first", firstAvg >= lastAvg); + } + + /** + * An empty candidate list must produce an empty group list. + */ + @Test + public void testGroupAndMerge_emptyCandidates() + { + StorageGarbageCollector gc = newGcForGrouping(0L, Integer.MAX_VALUE, 10); + List groups = + gc.groupAndMerge(Collections.emptyList()); + assertTrue("empty candidates → empty groups", groups.isEmpty()); + } + + /** + * Greedy splitting: two small files fit within the target size together. + * Each is 30 MB effective → combined 60 MB < 100 MB target → one group. 
+ */ + @Test + public void testGroupAndMerge_greedyMergeFitsInTarget() + { + long target = 100 * 1024 * 1024L; // 100 MB + StorageGarbageCollector gc = newGcForGrouping(target, Integer.MAX_VALUE, 10); + + List candidates = Arrays.asList( + new StorageGarbageCollector.FileCandidate( + makeFile(1, 1), "f1", 1, 1, 1L, 0, 0.70, 100 * 1024 * 1024L), + new StorageGarbageCollector.FileCandidate( + makeFile(2, 1), "f2", 2, 1, 1L, 0, 0.70, 100 * 1024 * 1024L) + ); + + List groups = gc.groupAndMerge(candidates); + + assertEquals("2 files × 30MB effective each < 100MB target → 1 group", 1, groups.size()); + assertEquals(2, groups.get(0).files.size()); + } + + /** + * Greedy splitting: a single file whose effective data exceeds targetFileSize + * must form its own group via the {@code singleFileOversized} branch, + * even if other small files follow. + * File A: 200 MB on disk, 10 % deleted → 180 MB effective > 100 MB target → oversized. + * File B: 50 MB on disk, 50 % deleted → 25 MB effective → fits alone. 
+ */ + @Test + public void testGroupAndMerge_greedyOversizedFileAlone() + { + long target = 100 * 1024 * 1024L; + StorageGarbageCollector gc = newGcForGrouping(target, Integer.MAX_VALUE, 10); + + List candidates = Arrays.asList( + new StorageGarbageCollector.FileCandidate( + makeFile(1, 1), "f1", 1, 1, 1L, 0, 0.10, 200 * 1024 * 1024L), + new StorageGarbageCollector.FileCandidate( + makeFile(2, 1), "f2", 2, 1, 1L, 0, 0.50, 50 * 1024 * 1024L) + ); + + List groups = gc.groupAndMerge(candidates); + + assertEquals("oversized file alone + small file alone → 2 groups", 2, groups.size()); + boolean foundOversizedAlone = false; + for (StorageGarbageCollector.FileGroup g : groups) + { + if (g.files.size() == 1 && g.files.get(0).fileId == 1) + { + foundOversizedAlone = true; + } + } + assertTrue("oversized file (180 MB effective > 100 MB target) must be in its own group", + foundOversizedAlone); + } + + /** + * When fileSizeBytes is 0 (unknown), greedy splitting is effectively disabled: + * all files in the same {@code (tableId, vNodeId)} form a single group. + */ + @Test + public void testGroupAndMerge_greedyFallbackWhenSizeUnknown() + { + long target = 100 * 1024 * 1024L; + StorageGarbageCollector gc = newGcForGrouping(target, Integer.MAX_VALUE, 10); + + // fileSizeBytes = 0 (unknown) — no splitting occurs + List candidates = Arrays.asList( + new StorageGarbageCollector.FileCandidate( + makeFile(1, 1), "f1", 1, 1, 1L, 0, 0.60, 0L), + new StorageGarbageCollector.FileCandidate( + makeFile(2, 1), "f2", 2, 1, 1L, 0, 0.60, 0L) + ); + + List groups = gc.groupAndMerge(candidates); + + assertEquals("unknown size → no splitting → 1 group", 1, groups.size()); + assertEquals(2, groups.get(0).files.size()); + } + + /** + * Five files sharing the same {@code (tableId, virtualNodeId)} with + * maxFilesPerGroup=2 must be split into 3 groups (2+2+1). + * targetFileSize is disabled (0) so only the file count limit applies. 
+ */ + @Test + public void testGroupAndMerge_maxFilesPerGroupSplit() + { + StorageGarbageCollector gc = newGcForGrouping(0L, 2, 100); + + List candidates = new ArrayList<>(); + for (int i = 0; i < 5; i++) + { + candidates.add(new StorageGarbageCollector.FileCandidate( + makeFile(i + 1, 1), "f" + i, i + 1, 1, 1L, 0, 0.90 - i * 0.05, 0L)); + } + + List groups = gc.groupAndMerge(candidates); + + assertEquals("5 files / maxFilesPerGroup=2 → 3 groups", 3, groups.size()); + assertEquals(2, groups.get(0).files.size()); + assertEquals(2, groups.get(1).files.size()); + assertEquals(1, groups.get(2).files.size()); + } + + /** + * Dual-bound: targetFileSize is very large (won't trigger), but maxFilesPerGroup=3 + * triggers first. Six files in same group → 2 groups of 3. + */ + @Test + public void testGroupAndMerge_dualBoundSplitByFileCount() + { + long hugeTarget = Long.MAX_VALUE; + StorageGarbageCollector gc = newGcForGrouping(hugeTarget, 3, 100); + + List candidates = new ArrayList<>(); + for (int i = 0; i < 6; i++) + { + candidates.add(new StorageGarbageCollector.FileCandidate( + makeFile(i + 1, 1), "f" + i, i + 1, 1, 1L, 0, 0.70, + 50 * 1024 * 1024L)); + } + + List groups = gc.groupAndMerge(candidates); + + assertEquals("6 files / maxFilesPerGroup=3 → 2 groups", 2, groups.size()); + assertEquals(3, groups.get(0).files.size()); + assertEquals(3, groups.get(1).files.size()); + } + + /** + * Dual-bound: maxFilesPerGroup is very large (won't trigger), but targetFileSize + * triggers first. Each file has 60MB effective data, target=100MB → each file + * forms its own group. 
+ */ + @Test + public void testGroupAndMerge_dualBoundSplitBySize() + { + long target = 100 * 1024 * 1024L; + StorageGarbageCollector gc = newGcForGrouping(target, Integer.MAX_VALUE, 100); + + List candidates = new ArrayList<>(); + for (int i = 0; i < 3; i++) + { + candidates.add(new StorageGarbageCollector.FileCandidate( + makeFile(i + 1, 1), "f" + i, i + 1, 1, 1L, 0, 0.40, + 100 * 1024 * 1024L)); + } + + List groups = gc.groupAndMerge(candidates); + + assertEquals("3 files × 60MB effective > 100MB target → 3 single-file groups", 3, groups.size()); + for (StorageGarbageCollector.FileGroup g : groups) + { + assertEquals(1, g.files.size()); + } + } + + /** + * A single candidate must produce exactly one group of one file. + */ + @Test + public void testGroupAndMerge_singleCandidate() + { + StorageGarbageCollector gc = newGcForGrouping(0L, Integer.MAX_VALUE, 10); + + List candidates = Collections.singletonList( + new StorageGarbageCollector.FileCandidate(makeFile(1, 1), "f1", 1, 1, 1L, 0, 0.80, 0L)); + + List groups = gc.groupAndMerge(candidates); + + assertEquals("single candidate → 1 group", 1, groups.size()); + assertEquals(1, groups.get(0).files.size()); + assertEquals(0.80, groups.get(0).files.get(0).invalidRatio, 1e-9); + } + + /** + * Greedy splitting: a file whose effective data size exactly equals + * {@code targetFileSize} must NOT be treated as oversized (the check + * uses strict {@code >}, not {@code >=}). Two such files should merge + * until the cumulative effective bytes exceeds the target. + * + *

    File A: 100 MB on disk, 50 % deleted → 50 MB effective == target. + * File B: 100 MB on disk, 50 % deleted → 50 MB effective. + * Cumulative A+B = 100 MB > 50 MB → flush A alone, then B alone → 2 groups. + * But neither is "oversized" individually. + */ + @Test + public void testGroupAndMerge_effectiveSizeExactlyEqualsTarget() + { + long target = 50 * 1024 * 1024L; // 50 MB + StorageGarbageCollector gc = newGcForGrouping(target, Integer.MAX_VALUE, 10); + + List candidates = Arrays.asList( + new StorageGarbageCollector.FileCandidate( + makeFile(1, 1), "f1", 1, 1, 1L, 0, 0.50, 100 * 1024 * 1024L), + new StorageGarbageCollector.FileCandidate( + makeFile(2, 1), "f2", 2, 1, 1L, 0, 0.50, 100 * 1024 * 1024L) + ); + + List groups = gc.groupAndMerge(candidates); + + assertEquals("effective == target → not oversized; A+B > target → 2 groups", 2, groups.size()); + for (StorageGarbageCollector.FileGroup g : groups) + { + assertEquals(1, g.files.size()); + } + } + + /** + * When all groups have the same average invalidRatio, sorting is stable + * and the correct number of groups is returned. 
+ */ + @Test + public void testGroupAndMerge_equalRatioSorting() + { + StorageGarbageCollector gc = newGcForGrouping(0L, Integer.MAX_VALUE, 10); + + List candidates = Arrays.asList( + new StorageGarbageCollector.FileCandidate( + makeFile(1, 1), "f1", 1, 1, 1L, 0, 0.70, 0L), + new StorageGarbageCollector.FileCandidate( + makeFile(2, 1), "f2", 2, 1, 2L, 0, 0.70, 0L), + new StorageGarbageCollector.FileCandidate( + makeFile(3, 1), "f3", 3, 1, 3L, 0, 0.70, 0L) + ); + + List groups = gc.groupAndMerge(candidates); + + assertEquals("3 distinct tableIds, all same ratio → 3 groups", 3, groups.size()); + for (StorageGarbageCollector.FileGroup g : groups) + { + assertEquals(0.70, g.files.get(0).invalidRatio, 1e-9); + } + } + + /** + * When both {@code targetFileSize} and {@code maxFilesPerGroup} are disabled + * (both {@code <= 0}), all files in the same {@code (tableId, virtualNodeId)} + * form a single group — the fast path in {@code splitIntoGroups}. + */ + @Test + public void testGroupAndMerge_bothLimitsDisabled() + { + StorageGarbageCollector gc = newGcForGrouping(0L, 0, 10); + + List candidates = new ArrayList<>(); + for (int i = 0; i < 5; i++) + { + candidates.add(new StorageGarbageCollector.FileCandidate( + makeFile(i + 1, 1), "f" + i, i + 1, 1, 1L, 0, 0.80 - i * 0.05, 0L)); + } + + List groups = gc.groupAndMerge(candidates); + + assertEquals("both limits disabled → all 5 files in 1 group", 1, groups.size()); + assertEquals(5, groups.get(0).files.size()); + } + + // ======================================================================= + // Section 2: threshold filtering via DirectScanStorageGC + // ======================================================================= + + /** + * Three files with deletion ratios 60 %, 40 %, 80 % against threshold=0.5; + * only the 60 % and 80 % files should appear as candidates, and they must be + * grouped by {@code (tableId, virtualNodeId)}. + * + *

    Verifies: threshold=0.5 → 60 % and 80 % selected, 40 % excluded. + */ + @Test + public void testScanAndGroupFiles_thresholdFiltering() + { + int totalRows = 100; + long fileId60 = 60001L; + long fileId40 = 40001L; + long fileId80 = 80001L; + + // Build file-level stats: {totalRows, invalidCount} per file. + Map fileStats = new HashMap<>(); + fileStats.put(fileId60, makeRgStats(totalRows, 60)); // 60 % deleted + fileStats.put(fileId40, makeRgStats(totalRows, 40)); // 40 % deleted + fileStats.put(fileId80, makeRgStats(totalRows, 80)); // 80 % deleted + + // Pre-compute candidate set (threshold=0.5): 60% and 80% qualify + Set candidateFileIds = new HashSet<>(Arrays.asList(fileId60, fileId80)); + + List fakeFiles = Arrays.asList( + new FakeFileEntry(fileId60, 1, 1L, 0), // ratio=0.60, should be selected + new FakeFileEntry(fileId40, 1, 1L, 0), // ratio=0.40, should be excluded + new FakeFileEntry(fileId80, 1, 2L, 0) // ratio=0.80, should be selected (different table) + ); + + DirectScanStorageGC gc = new DirectScanStorageGC( + retinaManager, 0.5, 10, fakeFiles); + + List groups = gc.scanAndGroupFiles(candidateFileIds, fileStats); + + // 60 % and 80 % → 2 separate groups (different tableId: 1 and 2) + assertEquals("2 groups expected (60% and 80%)", 2, groups.size()); + + // Collect all selected fileIds + List selectedIds = new ArrayList<>(); + for (StorageGarbageCollector.FileGroup g : groups) + { + for (StorageGarbageCollector.FileCandidate c : g.files) + { + selectedIds.add(c.fileId); + } + } + + assertTrue("fileId60 should be selected", selectedIds.contains(fileId60)); + assertFalse("fileId40 should NOT be selected (ratio <= threshold)", selectedIds.contains(fileId40)); + assertTrue("fileId80 should be selected", selectedIds.contains(fileId80)); + } + + /** + * Two files sharing the same {@code (tableId, virtualNodeId)} must be in the same group. 
+ */ + @Test + public void testScanAndGroupFiles_sameTableVNodeGroupedTogether() + { + int totalRows = 100; + long fileIdA = 70001L; + long fileIdB = 70002L; + + Map fileStats = new HashMap<>(); + fileStats.put(fileIdA, makeRgStats(totalRows, 60)); // 60 % + fileStats.put(fileIdB, makeRgStats(totalRows, 75)); // 75 % + + Set candidateFileIds = new HashSet<>(Arrays.asList(fileIdA, fileIdB)); + + // Both files belong to same (tableId=5, vNodeId=3) + List fakeFiles = Arrays.asList( + new FakeFileEntry(fileIdA, 1, 5L, 3), + new FakeFileEntry(fileIdB, 1, 5L, 3) + ); + + DirectScanStorageGC gc = new DirectScanStorageGC( + retinaManager, 0.5, 10, fakeFiles); + + List groups = gc.scanAndGroupFiles(candidateFileIds, fileStats); + + assertEquals("both files share (table=5, vNode=3) → 1 group", 1, groups.size()); + assertEquals("group must contain 2 files", 2, groups.get(0).files.size()); + assertEquals(5L, groups.get(0).tableId); + assertEquals(3, groups.get(0).virtualNodeId); + } + + /** + * A file whose fileId has no entry in {@code fileStats} must be skipped + * (totalRows == 0 → excluded regardless of threshold). + */ + @Test + public void testScanAndGroupFiles_skipsFilesWithNoVisibility() + { + long orphanFileId = 99999L; + Map fileStats = new HashMap<>(); + Set candidateFileIds = Collections.singleton(orphanFileId); + + List fakeFiles = Collections.singletonList( + new FakeFileEntry(orphanFileId, 1, 1L, 0)); + + DirectScanStorageGC gc = new DirectScanStorageGC( + retinaManager, 0.5, 10, fakeFiles); + + List groups = gc.scanAndGroupFiles(candidateFileIds, fileStats); + assertTrue("file with no fileStats entry should be skipped", groups.isEmpty()); + } + + // ======================================================================= + // Section 3: runStorageGC bitmap trimming + // ======================================================================= + + /** + * After {@code runStorageGC}, the {@code gcSnapshotBitmaps} map must have had + * non-candidate entries removed. 
Candidate bitmaps must be retained for the rewrite phase. + */ + @Test + public void testRunStorageGC_trimsBitmapMapToCandidate() + { + long candidateFileId = 66001L; + long otherFileId = 66002L; + + Map bitmaps = new HashMap<>(); + bitmaps.put(candidateFileId + "_0", makeBitmap(100, 60)); + bitmaps.put(otherFileId + "_0", makeBitmap(100, 20)); + + // File-level stats: candidateFileId has 60% deletion, otherFileId has 20% + Map fileStats = new HashMap<>(); + fileStats.put(candidateFileId, makeRgStats(100, 60)); + fileStats.put(otherFileId, makeRgStats(100, 20)); + + List fakeFiles = Arrays.asList( + new FakeFileEntry(candidateFileId, 1, 1L, 0), + new FakeFileEntry(otherFileId, 1, 1L, 0)); + + DirectScanStorageGC gc = new DirectScanStorageGC( + retinaManager, 0.5, 10, fakeFiles); + + gc.runStorageGC(300L, fileStats, bitmaps); + + assertTrue("candidate RG key must be retained", + bitmaps.containsKey(candidateFileId + "_0")); + assertFalse("non-candidate RG key must be removed", + bitmaps.containsKey(otherFileId + "_0")); + } + + // ======================================================================= + // Section 4: runStorageGC end-to-end scan → process + // ======================================================================= + + /** + * A file whose invalidRatio is exactly equal to the threshold (0.5) must NOT + * be selected as a candidate. The design uses strict {@code >}, not {@code >=}. 
+ */ + @Test + public void testRunStorageGC_thresholdExactlyEqual() + { + long fileId = 57001L; + + Map fileStats = new HashMap<>(); + fileStats.put(fileId, makeRgStats(100, 50)); // exactly 50% = threshold + + Map bitmaps = new HashMap<>(); + bitmaps.put(fileId + "_0", makeBitmap(100, 50)); + + DirectScanStorageGC gc = new DirectScanStorageGC( + retinaManager, 0.5, 10, + Collections.singletonList(new FakeFileEntry(fileId, 1, 1L, 0))); + + gc.runStorageGC(400L, fileStats, bitmaps); + + assertTrue("file at exactly threshold must NOT be trimmed (no candidates)", + bitmaps.containsKey(fileId + "_0")); + assertEquals(1, bitmaps.size()); + } + + /** + * A file whose {@code fileStats} entry has {@code totalRows=0} must not + * produce a candidate even if invalidCount is also 0 (division by zero guard). + */ + @Test + public void testRunStorageGC_skipsTotalRowsZero() + { + long fileId = 58001L; + + Map fileStats = new HashMap<>(); + fileStats.put(fileId, new long[]{0, 0}); // totalRows=0 + + Map bitmaps = new HashMap<>(); + bitmaps.put(fileId + "_0", new long[]{0L}); + + DirectScanStorageGC gc = new DirectScanStorageGC( + retinaManager, 0.5, 10, + Collections.singletonList(new FakeFileEntry(fileId, 1, 1L, 0))); + + gc.runStorageGC(500L, fileStats, bitmaps); + + assertTrue("totalRows=0 file must remain untouched (no candidates)", + bitmaps.containsKey(fileId + "_0")); + } + + // ======================================================================= + // Section 4b: processFileGroups error handling + // ======================================================================= + + /** + * When {@code rewriteFileGroup} throws for the first FileGroup, + * {@code processFileGroups} must catch the exception, clean up that + * group's bitmap entries, and continue processing the second group. + * + *

    Uses {@link FailFirstGroupGC} to inject a deterministic failure + * on the first {@code rewriteFileGroup} call while the real + * {@code processFileGroups} loop executes. + */ + @Test + public void testProcessFileGroups_firstGroupFailsSecondContinues() + { + long fileIdA = 88001L; + long fileIdB = 88002L; + + Map bitmaps = new HashMap<>(); + bitmaps.put(fileIdA + "_0", makeBitmap(100, 60)); + bitmaps.put(fileIdB + "_0", makeBitmap(100, 60)); + + StorageGarbageCollector.FileGroup groupA = new StorageGarbageCollector.FileGroup( + 1L, 0, Collections.singletonList( + new StorageGarbageCollector.FileCandidate( + makeFile(fileIdA, 1), "fake_a", fileIdA, 1, 1L, 0, 0.60, 0L))); + StorageGarbageCollector.FileGroup groupB = new StorageGarbageCollector.FileGroup( + 2L, 0, Collections.singletonList( + new StorageGarbageCollector.FileCandidate( + makeFile(fileIdB, 1), "fake_b", fileIdB, 1, 2L, 0, 0.60, 0L))); + + FailFirstGroupGC failGc = new FailFirstGroupGC(); + failGc.processFileGroups(Arrays.asList(groupA, groupB), 300L, bitmaps); + + assertFalse("failed group A's bitmap must be cleaned up by catch block", + bitmaps.containsKey(fileIdA + "_0")); + assertFalse("successful group B's bitmap must be cleaned up by rewrite stub", + bitmaps.containsKey(fileIdB + "_0")); + } + + // ======================================================================= + // Section 5: data rewrite functional tests + // ======================================================================= + + /** + * When there is no bitmap entry for a source file RG, {@code isBitmapBitSet} + * returns false for every row (null bitmap ≡ no deletions) and all rows pass + * through unchanged. 
+ */ + @Test + public void testNullBitmapKeepsAllRows() throws Exception + { + TypeDescription schema = LONG_ID_SCHEMA; + long[] ids = {10L, 20L, 30L, 40L, 50L}; + long fileId = 2L; + String srcPath = writeTestFile("src_null_bitmap.pxl", schema, ids, false, null); + + // Deliberately empty bitmaps map → null bitmap for every RG + Map bitmaps = new HashMap<>(); + + StorageGarbageCollector.RewriteResult result = + gc.rewriteFileGroup(makeGroup(fileId, srcPath, schema), 100L, bitmaps); + + long[][] rows = readAllRows(result.newFilePath, schema, false); + assertEquals("all 5 rows should survive", 5, rows.length); + for (int i = 0; i < 5; i++) + { + assertEquals("id mismatch at row " + i, ids[i], rows[i][0]); + } + + assertRewriteResultConsistency(result, 5); + } + + /** + * When every row in the source RG is marked deleted, {@code rewriteFileGroup} + * deletes the empty output file and returns a sentinel {@code RewriteResult} + * with {@code newFileId == -1}. All forward-mapping entries must be {@code -1}. 
+ */ + @Test + public void testAllRowsDeleted() throws Exception + { + TypeDescription schema = LONG_ID_SCHEMA; + long[] ids = {1L, 2L, 3L, 4L, 5L}; + long fileId = 3L; + String srcPath = writeTestFile("src_all_deleted.pxl", schema, ids, false, null); + + Map bitmaps = new HashMap<>(); + bitmaps.put(fileId + "_0", makeBitmapForRows(5, 0, 1, 2, 3, 4)); + + StorageGarbageCollector.RewriteResult result = + gc.rewriteFileGroup(makeGroup(fileId, srcPath, schema), 100L, bitmaps); + + assertEquals("newFileId should be -1 for all-deleted group", -1, result.newFileId); + assertEquals("newFileRgCount should be 0", 0, result.newFileRgCount); + assertEquals(0, result.newFileRgActualRecordNums.length); + + int[] fwd = result.forwardRgMappings.get(fileId).get(0); + assertNotNull("fwdMapping must exist even for all-deleted RG", fwd); + for (int i = 0; i < fwd.length; i++) + { + assertEquals("all rows deleted → every mapping entry must be -1", -1, fwd[i]); + } + assertFalse("gcSnapshotBitmaps entry must be removed after rewrite", + bitmaps.containsKey(fileId + "_0")); + } + + // ======================================================================= + // Section 5b: multi-file and multi-RG rewrite tests + // ======================================================================= + + /** + * Rewrites a FileGroup containing two source files into one output file. + * File A has rows {100,101,102,103,104}, file B has rows {200,201,202,203,204}. + * Rows 0 and 2 are deleted from file A; rows 1 and 3 from file B. + * Survivors (in order): A:{101,103,104}, B:{200,202,204} → 6 rows total. + * Verifies output data, forward mappings for both files, and bitmap cleanup. 
+ */ + @Test + public void testMultiFileGroupRewrite() throws Exception + { + TypeDescription schema = LONG_ID_SCHEMA; + long fileIdA = 10L; + long fileIdB = 11L; + long[] idsA = {100L, 101L, 102L, 103L, 104L}; + long[] idsB = {200L, 201L, 202L, 203L, 204L}; + + String pathA = writeTestFile("src_multi_a.pxl", schema, idsA, false, null); + String pathB = writeTestFile("src_multi_b.pxl", schema, idsB, false, null); + + Map bitmaps = new HashMap<>(); + bitmaps.put(fileIdA + "_0", makeBitmapForRows(5, 0, 2)); + bitmaps.put(fileIdB + "_0", makeBitmapForRows(5, 1, 3)); + + StorageGarbageCollector.FileGroup group = + makeMultiFileGroup(schema, fileIdA, pathA, fileIdB, pathB); + + StorageGarbageCollector.RewriteResult result = + gc.rewriteFileGroup(group, 100L, bitmaps); + + long[][] rows = readAllRows(result.newFilePath, schema, false); + assertEquals("6 rows should survive across two files", 6, rows.length); + long[] expectedIds = {101L, 103L, 104L, 200L, 202L, 204L}; + for (int i = 0; i < expectedIds.length; i++) + { + assertEquals("id mismatch at row " + i, expectedIds[i], rows[i][0]); + } + + // Forward mapping for file A: row 0 → -1, 1 → 0, 2 → -1, 3 → 1, 4 → 2 + int[] fwdA = result.forwardRgMappings.get(fileIdA).get(0); + assertEquals(-1, fwdA[0]); + assertEquals(0, fwdA[1]); + assertEquals(-1, fwdA[2]); + assertEquals(1, fwdA[3]); + assertEquals(2, fwdA[4]); + + // Forward mapping for file B: row 0 → 3, 1 → -1, 2 → 4, 3 → -1, 4 → 5 + int[] fwdB = result.forwardRgMappings.get(fileIdB).get(0); + assertEquals(3, fwdB[0]); + assertEquals(-1, fwdB[1]); + assertEquals(4, fwdB[2]); + assertEquals(-1, fwdB[3]); + assertEquals(5, fwdB[4]); + + assertFalse("bitmap A must be removed", bitmaps.containsKey(fileIdA + "_0")); + assertFalse("bitmap B must be removed", bitmaps.containsKey(fileIdB + "_0")); + + assertRewriteResultConsistency(result, 6); + } + + /** + * Rewrites a source file containing multiple row groups (forced by a tiny + * rowGroupSize). 
Rows are deleted from different RGs and the output must + * contain only survivors with correct data and forward mappings that + * reference per-RG arrays. + */ + @Test + public void testMultiRgRewrite() throws Exception + { + TypeDescription schema = LONG_ID_SCHEMA; + long fileId = 20L; + + // Write a file with 2 row groups: RG0 has ids {0..4}, RG1 has ids {5..9} + String srcPath = writeMultiRgTestFile("src_multi_rg.pxl", schema, + new long[][]{{0L, 1L, 2L, 3L, 4L}, {5L, 6L, 7L, 8L, 9L}}, + false, null); + + // Verify the source actually has 2 RGs + int srcRgCount; + try (PixelsReader r = PixelsReaderImpl.newBuilder() + .setStorage(fileStorage).setPath(srcPath) + .setPixelsFooterCache(new PixelsFooterCache()).build()) + { + srcRgCount = r.getRowGroupNum(); + } + assertEquals("source file must have 2 row groups", 2, srcRgCount); + + // Delete rows 1,3 from RG0 and rows 0,4 from RG1 + Map bitmaps = new HashMap<>(); + bitmaps.put(fileId + "_0", makeBitmapForRows(5, 1, 3)); + bitmaps.put(fileId + "_1", makeBitmapForRows(5, 0, 4)); + + StorageGarbageCollector.FileGroup group = makeGroup(fileId, srcPath, schema); + + StorageGarbageCollector.RewriteResult result = + gc.rewriteFileGroup(group, 100L, bitmaps); + + long[][] rows = readAllRows(result.newFilePath, schema, false); + assertEquals("6 rows should survive across 2 RGs", 6, rows.length); + long[] expectedIds = {0L, 2L, 4L, 6L, 7L, 8L}; + for (int i = 0; i < expectedIds.length; i++) + { + assertEquals("id mismatch at row " + i, expectedIds[i], rows[i][0]); + } + + // Forward mapping for RG0: row 0→0, 1→-1, 2→1, 3→-1, 4→2 + int[] fwdRg0 = result.forwardRgMappings.get(fileId).get(0); + assertNotNull("fwdMapping for rg0 must exist", fwdRg0); + assertEquals(0, fwdRg0[0]); + assertEquals(-1, fwdRg0[1]); + assertEquals(1, fwdRg0[2]); + assertEquals(-1, fwdRg0[3]); + assertEquals(2, fwdRg0[4]); + + // Forward mapping for RG1: row 0→-1, 1→3, 2→4, 3→5, 4→-1 + int[] fwdRg1 = result.forwardRgMappings.get(fileId).get(1); + 
assertNotNull("fwdMapping for rg1 must exist", fwdRg1); + assertEquals(-1, fwdRg1[0]); + assertEquals(3, fwdRg1[1]); + assertEquals(4, fwdRg1[2]); + assertEquals(5, fwdRg1[3]); + assertEquals(-1, fwdRg1[4]); + + assertFalse("bitmap rg0 must be removed", bitmaps.containsKey(fileId + "_0")); + assertFalse("bitmap rg1 must be removed", bitmaps.containsKey(fileId + "_1")); + + assertRewriteResultConsistency(result, 6); + } + + /** + * Verifies backward mapping correctness for a multi-RG source file. + * For every surviving new row, the backward mapping must point to the + * correct old-file global row offset, and the round-trip through forward + * then backward must be consistent. + * + * Setup (same as {@link #testMultiRgRewrite}): + * RG0: rows {0,1,2,3,4}, delete 1,3 → survivors: 0,2,4 → new global 0,1,2 + * RG1: rows {5,6,7,8,9}, delete 0,4 → survivors: 6,7,8 → new global 3,4,5 + * oldFileRgRowStart = [0, 5, 10] + * + * Expected backward mapping (newGlobal → oldGlobal): + * 0→0, 1→2, 2→4, 3→6, 4→7, 5→8 + */ + @Test + public void testMultiRgRewrite_backwardMappingCorrectness() throws Exception + { + TypeDescription schema = LONG_ID_SCHEMA; + long fileId = 22L; + + String srcPath = writeMultiRgTestFile("src_bwd_map.pxl", schema, + new long[][]{{0L, 1L, 2L, 3L, 4L}, {5L, 6L, 7L, 8L, 9L}}, + false, null); + + Map bitmaps = new HashMap<>(); + bitmaps.put(fileId + "_0", makeBitmapForRows(5, 1, 3)); + bitmaps.put(fileId + "_1", makeBitmapForRows(5, 0, 4)); + + StorageGarbageCollector.FileGroup group = makeGroup(fileId, srcPath, schema); + StorageGarbageCollector.RewriteResult result = + gc.rewriteFileGroup(group, 100L, bitmaps); + + assertRewriteResultConsistency(result, 6); + assertEquals("should have exactly 1 BackwardInfo (single source file)", + 1, result.backwardInfos.size()); + + StorageGarbageCollector.BackwardInfo bwd = result.backwardInfos.get(0); + assertEquals("backwardInfo should reference the source file", fileId, bwd.oldFileId); + + // oldFileRgRowStart: RG0 
starts at 0 (5 rows), RG1 starts at 5 (5 rows), sentinel=10 + assertEquals(0, bwd.oldFileRgRowStart[0]); + assertEquals(5, bwd.oldFileRgRowStart[1]); + assertEquals(10, bwd.oldFileRgRowStart[2]); + + // Collect all backward mapping entries: newGlobal → oldGlobal + int[] expectedOldGlobal = {0, 2, 4, 6, 7, 8}; + int newGlobal = 0; + for (int newRgId = 0; newRgId < result.newFileRgCount; newRgId++) + { + int[] bwdMapping = bwd.backwardRgMappings.get(newRgId); + if (bwdMapping == null) + { + newGlobal += result.newFileRgActualRecordNums[newRgId]; + continue; + } + for (int newRgOff = 0; newRgOff < bwdMapping.length; newRgOff++, newGlobal++) + { + int oldGlobal = bwdMapping[newRgOff]; + assertTrue("backward mapping entry must not be -1 for surviving row at newGlobal=" + newGlobal, + oldGlobal >= 0); + assertEquals("backward mapping mismatch at newGlobal=" + newGlobal, + expectedOldGlobal[newGlobal], oldGlobal); + } + } + assertEquals("total backward-mapped rows must equal total surviving rows", + 6, newGlobal); + + // Round-trip consistency: for each old row, forward then backward should be identity + Map fwdMappings = result.forwardRgMappings.get(fileId); + for (int oldRgId = 0; oldRgId < 2; oldRgId++) + { + int[] fwdMapping = fwdMappings.get(oldRgId); + for (int oldOff = 0; oldOff < fwdMapping.length; oldOff++) + { + int fwdGlobal = fwdMapping[oldOff]; + if (fwdGlobal < 0) + { + continue; + } + int fwdNewRgId = RetinaResourceManager.rgIdForGlobalRowOffset(fwdGlobal, result.newFileRgRowStart); + int fwdNewRgOff = fwdGlobal - result.newFileRgRowStart[fwdNewRgId]; + int roundTrip = bwd.backwardRgMappings.get(fwdNewRgId)[fwdNewRgOff]; + int expectedGlobal = bwd.oldFileRgRowStart[oldRgId] + oldOff; + assertEquals("round-trip oldRg=" + oldRgId + " oldOff=" + oldOff, + expectedGlobal, roundTrip); + } + } + } + + // ======================================================================= + // Section 5c: edge-case rewrite tests + // 
======================================================================= + + /** + * Bitmap word-boundary correctness: 128 rows, delete rows at positions + * 0, 63, 64, 127 (crossing the 64-bit word boundary). Survivors are all + * other 124 rows. Verifies data and forward mapping at boundary positions. + */ + @Test + public void testBitmapWordBoundaryFiltering() throws Exception + { + TypeDescription schema = LONG_ID_SCHEMA; + int totalRows = 128; + long[] ids = new long[totalRows]; + for (int i = 0; i < totalRows; i++) + { + ids[i] = i; + } + long fileId = 32L; + String srcPath = writeTestFile("src_boundary.pxl", schema, ids, false, null); + + Map bitmaps = new HashMap<>(); + bitmaps.put(fileId + "_0", makeBitmapForRows(totalRows, 0, 63, 64, 127)); + + StorageGarbageCollector.RewriteResult result = + gc.rewriteFileGroup(makeGroup(fileId, srcPath, schema), 100L, bitmaps); + + long[][] rows = readAllRows(result.newFilePath, schema, false); + assertEquals("124 rows should survive (128 - 4 deleted)", 124, rows.length); + + Set deletedSet = new HashSet<>(Arrays.asList(0L, 63L, 64L, 127L)); + int survivorIdx = 0; + for (int i = 0; i < totalRows; i++) + { + if (!deletedSet.contains((long) i)) + { + assertEquals("survivor data mismatch at original row " + i, + (long) i, rows[survivorIdx][0]); + survivorIdx++; + } + } + assertEquals(124, survivorIdx); + + int[] fwd = result.forwardRgMappings.get(fileId).get(0); + assertEquals(-1, fwd[0]); + assertEquals(-1, fwd[63]); + assertEquals(-1, fwd[64]); + assertEquals(-1, fwd[127]); + assertTrue("row 1 should map to a valid new offset", fwd[1] >= 0); + assertTrue("row 65 should map to a valid new offset", fwd[65] >= 0); + + assertRewriteResultConsistency(result, 124); + } + + /** + * Multi-file group where file A has all rows deleted and file B has + * survivors. Verifies mixed forward mappings and that the output + * contains only B's surviving rows. 
+ */ + @Test + public void testMultiFileGroupRewrite_oneFileAllDeleted() throws Exception + { + TypeDescription schema = LONG_ID_SCHEMA; + long fileIdA = 40L; + long fileIdB = 41L; + long[] idsA = {10L, 11L, 12L}; + long[] idsB = {20L, 21L, 22L}; + + String pathA = writeTestFile("src_mix_all_del_a.pxl", schema, idsA, false, null); + String pathB = writeTestFile("src_mix_all_del_b.pxl", schema, idsB, false, null); + + Map bitmaps = new HashMap<>(); + bitmaps.put(fileIdA + "_0", makeBitmapForRows(3, 0, 1, 2)); + bitmaps.put(fileIdB + "_0", makeBitmapForRows(3, 1)); + + StorageGarbageCollector.FileGroup group = + makeMultiFileGroup(schema, fileIdA, pathA, fileIdB, pathB); + + StorageGarbageCollector.RewriteResult result = + gc.rewriteFileGroup(group, 100L, bitmaps); + + long[][] rows = readAllRows(result.newFilePath, schema, false); + assertEquals("only B's 2 survivors should be in output", 2, rows.length); + assertEquals(20L, rows[0][0]); + assertEquals(22L, rows[1][0]); + + int[] fwdA = result.forwardRgMappings.get(fileIdA).get(0); + for (int i = 0; i < fwdA.length; i++) + { + assertEquals("file A all deleted → every mapping must be -1", -1, fwdA[i]); + } + + int[] fwdB = result.forwardRgMappings.get(fileIdB).get(0); + assertEquals(0, fwdB[0]); + assertEquals(-1, fwdB[1]); + assertEquals(1, fwdB[2]); + + assertRewriteResultConsistency(result, 2); + } + + /** + * Multi-file group where BOTH files have ALL rows deleted. + * Triggers the {@code globalNewRowOffset == 0} path with multiple source files: + * the empty output file is deleted and {@code newFileId == -1}. + * Forward mappings for both files must be all {@code -1}. 
+ */ + @Test + public void testMultiFileGroupRewrite_allFilesAllDeleted() throws Exception + { + TypeDescription schema = LONG_ID_SCHEMA; + long fileIdA = 42L; + long fileIdB = 43L; + long[] idsA = {10L, 11L, 12L}; + long[] idsB = {20L, 21L, 22L}; + + String pathA = writeTestFile("src_all_del_multi_a.pxl", schema, idsA, false, null); + String pathB = writeTestFile("src_all_del_multi_b.pxl", schema, idsB, false, null); + + Map bitmaps = new HashMap<>(); + bitmaps.put(fileIdA + "_0", makeBitmapForRows(3, 0, 1, 2)); + bitmaps.put(fileIdB + "_0", makeBitmapForRows(3, 0, 1, 2)); + + StorageGarbageCollector.FileGroup group = + makeMultiFileGroup(schema, fileIdA, pathA, fileIdB, pathB); + + StorageGarbageCollector.RewriteResult result = + gc.rewriteFileGroup(group, 100L, bitmaps); + + assertEquals("newFileId should be -1 for all-deleted multi-file group", -1, result.newFileId); + assertEquals("newFileRgCount should be 0", 0, result.newFileRgCount); + assertEquals(0, result.newFileRgActualRecordNums.length); + + int[] fwdA = result.forwardRgMappings.get(fileIdA).get(0); + for (int i = 0; i < fwdA.length; i++) + { + assertEquals("file A: all rows deleted → mapping must be -1", -1, fwdA[i]); + } + + int[] fwdB = result.forwardRgMappings.get(fileIdB).get(0); + for (int i = 0; i < fwdB.length; i++) + { + assertEquals("file B: all rows deleted → mapping must be -1", -1, fwdB[i]); + } + + assertFalse("bitmap A must be removed", bitmaps.containsKey(fileIdA + "_0")); + assertFalse("bitmap B must be removed", bitmaps.containsKey(fileIdB + "_0")); + } + + /** + * Non-null all-zero bitmap: the bitmap exists but has no bits set (no deletions). + * All rows must survive. This exercises a distinct code path from a null bitmap: + * {@code gcBitmap != null} evaluates to {@code true} but no bit check succeeds. 
+ */ + @Test + public void testAllZeroBitmapKeepsAllRows() throws Exception + { + TypeDescription schema = LONG_ID_SCHEMA; + long[] ids = {10L, 20L, 30L, 40L, 50L}; + long fileId = 33L; + String srcPath = writeTestFile("src_allzero_bitmap.pxl", schema, ids, false, null); + + Map bitmaps = new HashMap<>(); + bitmaps.put(fileId + "_0", new long[]{0L}); + + StorageGarbageCollector.RewriteResult result = + gc.rewriteFileGroup(makeGroup(fileId, srcPath, schema), 100L, bitmaps); + + long[][] rows = readAllRows(result.newFilePath, schema, false); + assertEquals("all 5 rows should survive with all-zero bitmap", 5, rows.length); + for (int i = 0; i < 5; i++) + { + assertEquals("id mismatch at row " + i, ids[i], rows[i][0]); + } + + int[] fwd = result.forwardRgMappings.get(fileId).get(0); + for (int i = 0; i < fwd.length; i++) + { + assertEquals("no deletions → mapping must be identity", i, fwd[i]); + } + + assertRewriteResultConsistency(result, 5); + } + + /** + * Multi-RG file where one RG has ALL rows deleted and the other has survivors. + * RG0: rows {0,1,2} all deleted → 0 survivors from RG0. + * RG1: rows {3,4,5}, delete row 0 → survivors 4,5 → new global offsets 0,1. + * Verifies output data, forward mappings per RG, backward mapping, and that + * the all-deleted RG does not produce any output. 
+ */ + @Test + public void testMultiRgRewrite_oneRgCompletelyDeleted() throws Exception + { + TypeDescription schema = LONG_ID_SCHEMA; + long fileId = 35L; + + String srcPath = writeMultiRgTestFile("src_rg_all_del.pxl", schema, + new long[][]{{0L, 1L, 2L}, {3L, 4L, 5L}}, + false, null); + + int srcRgCount; + try (PixelsReader r = PixelsReaderImpl.newBuilder() + .setStorage(fileStorage).setPath(srcPath) + .setPixelsFooterCache(new PixelsFooterCache()).build()) + { + srcRgCount = r.getRowGroupNum(); + } + assertEquals("source file must have 2 row groups", 2, srcRgCount); + + Map bitmaps = new HashMap<>(); + bitmaps.put(fileId + "_0", makeBitmapForRows(3, 0, 1, 2)); + bitmaps.put(fileId + "_1", makeBitmapForRows(3, 0)); + + StorageGarbageCollector.FileGroup group = makeGroup(fileId, srcPath, schema); + + StorageGarbageCollector.RewriteResult result = + gc.rewriteFileGroup(group, 100L, bitmaps); + + long[][] rows = readAllRows(result.newFilePath, schema, false); + assertEquals("2 rows should survive (RG0 all deleted, RG1: 2 survivors)", 2, rows.length); + assertEquals(4L, rows[0][0]); + assertEquals(5L, rows[1][0]); + + int[] fwdRg0 = result.forwardRgMappings.get(fileId).get(0); + for (int i = 0; i < fwdRg0.length; i++) + { + assertEquals("RG0 all deleted → every mapping must be -1", -1, fwdRg0[i]); + } + + int[] fwdRg1 = result.forwardRgMappings.get(fileId).get(1); + assertEquals(-1, fwdRg1[0]); + assertEquals(0, fwdRg1[1]); + assertEquals(1, fwdRg1[2]); + + assertFalse("bitmap rg0 must be removed", bitmaps.containsKey(fileId + "_0")); + assertFalse("bitmap rg1 must be removed", bitmaps.containsKey(fileId + "_1")); + + assertRewriteResultConsistency(result, 2); + + assertEquals(1, result.backwardInfos.size()); + StorageGarbageCollector.BackwardInfo bwd = result.backwardInfos.get(0); + assertEquals(0, bwd.oldFileRgRowStart[0]); + assertEquals(3, bwd.oldFileRgRowStart[1]); + assertEquals(6, bwd.oldFileRgRowStart[2]); + + int globalIdx = 0; + for (int newRgId = 0; 
newRgId < result.newFileRgCount; newRgId++) + { + int[] bwdMapping = bwd.backwardRgMappings.get(newRgId); + if (bwdMapping == null) + { + continue; + } + for (int off = 0; off < bwdMapping.length; off++, globalIdx++) + { + assertTrue("backward mapping entry should be valid", + bwdMapping[off] >= 0); + } + } + assertEquals(2, globalIdx); + } + + /** + * Multi-file group backward mapping correctness: two files, each with 5 rows. + * File A: delete rows 0,4 → survivors 1,2,3 → new global 0,1,2 + * File B: delete rows 1,3 → survivors 0,2,4 → new global 3,4,5 + * + * Backward mapping per old file: + * File A: oldFileRgRowStart = [0, 5], newGlobal 0→old 1, 1→old 2, 2→old 3 + * File B: oldFileRgRowStart = [0, 5], newGlobal 3→old 0, 4→old 2, 5→old 4 + * Round-trip: forward(old) → backward(new) must be identity. + */ + @Test + public void testMultiFileGroupRewrite_backwardMapping() throws Exception + { + TypeDescription schema = LONG_ID_SCHEMA; + long fileIdA = 36L; + long fileIdB = 37L; + long[] idsA = {100L, 101L, 102L, 103L, 104L}; + long[] idsB = {200L, 201L, 202L, 203L, 204L}; + + String pathA = writeTestFile("src_mf_bwd_a.pxl", schema, idsA, false, null); + String pathB = writeTestFile("src_mf_bwd_b.pxl", schema, idsB, false, null); + + Map bitmaps = new HashMap<>(); + bitmaps.put(fileIdA + "_0", makeBitmapForRows(5, 0, 4)); + bitmaps.put(fileIdB + "_0", makeBitmapForRows(5, 1, 3)); + + StorageGarbageCollector.FileGroup group = + makeMultiFileGroup(schema, fileIdA, pathA, fileIdB, pathB); + + StorageGarbageCollector.RewriteResult result = + gc.rewriteFileGroup(group, 100L, bitmaps); + + assertRewriteResultConsistency(result, 6); + assertEquals("should have 2 BackwardInfos (two source files)", + 2, result.backwardInfos.size()); + + for (StorageGarbageCollector.BackwardInfo bwd : result.backwardInfos) + { + long oldFileId = bwd.oldFileId; + Map fwdMappings = result.forwardRgMappings.get(oldFileId); + assertNotNull("forward mappings must exist for oldFileId=" + 
oldFileId, fwdMappings); + + for (int oldRgId = 0; oldRgId < bwd.oldFileRgRowStart.length - 1; oldRgId++) + { + int[] fwdMapping = fwdMappings.get(oldRgId); + if (fwdMapping == null) + { + continue; + } + for (int oldOff = 0; oldOff < fwdMapping.length; oldOff++) + { + int fwdGlobal = fwdMapping[oldOff]; + if (fwdGlobal < 0) + { + continue; + } + int newRgId = RetinaResourceManager.rgIdForGlobalRowOffset( + fwdGlobal, result.newFileRgRowStart); + int newRgOff = fwdGlobal - result.newFileRgRowStart[newRgId]; + int[] bwdMapping = bwd.backwardRgMappings.get(newRgId); + assertNotNull("backward mapping must exist for newRgId=" + newRgId, bwdMapping); + int roundTrip = bwdMapping[newRgOff]; + int expectedGlobal = bwd.oldFileRgRowStart[oldRgId] + oldOff; + assertEquals("round-trip file=" + oldFileId + " oldRg=" + oldRgId + + " oldOff=" + oldOff, + expectedGlobal, roundTrip); + } + } + } + + int[] fwdA = result.forwardRgMappings.get(fileIdA).get(0); + assertEquals(-1, fwdA[0]); + assertEquals(0, fwdA[1]); + assertEquals(1, fwdA[2]); + assertEquals(2, fwdA[3]); + assertEquals(-1, fwdA[4]); + + int[] fwdB = result.forwardRgMappings.get(fileIdB).get(0); + assertEquals(3, fwdB[0]); + assertEquals(-1, fwdB[1]); + assertEquals(4, fwdB[2]); + assertEquals(-1, fwdB[3]); + assertEquals(5, fwdB[4]); + } + + // ======================================================================= + // Section 6: dual-write functional tests + // ======================================================================= + + /** + * After unregisterDualWrite, deletes no longer propagate. 
+ */ + @Test + public void testDualWrite_unregisterStops() throws Exception + { + TypeDescription schema = LONG_ID_SCHEMA; + long[] ids = {0L, 1L, 2L, 3L}; + long fileId = 32L; + String srcPath = writeTestFile("dw_unreg_src.pxl", schema, ids, false, null); + + Map bitmaps = new HashMap<>(); + bitmaps.put(fileId + "_0", makeBitmapForRows(4, 0)); + + retinaManager.addVisibility(fileId, 0, 4, 0L, null, false); + + StorageGarbageCollector.RewriteResult result = + gc.rewriteFileGroup(makeGroup(fileId, srcPath, schema), 100L, bitmaps); + long newFileId = result.newFileId; + + gc.registerDualWrite(result); + gc.unregisterDualWrite(result); + + // Old file row 1 → new file row 0 (first survivor). + // After unregister, delete should NOT propagate. + long deleteTs = 200L; + retinaManager.deleteRecord(fileId, 0, 1, deleteTs); + + // Old file: row 1 should be deleted (direct write always works) + long[] oldBitmap = retinaManager.queryVisibility(fileId, 0, deleteTs, 0L); + assertTrue("old file row 1 should be deleted (direct write)", + (oldBitmap[1 / 64] & (1L << (1 % 64))) != 0); + + // New file: row 0 should NOT be deleted (dual-write is off) + long[] newBitmap = retinaManager.queryVisibility(newFileId, 0, deleteTs, 0L); + assertFalse("new file row 0 should NOT be deleted after unregister", + (newBitmap[0 / 64] & (1L << (0 % 64))) != 0); + } + + // ======================================================================= + // Section 7b: rgIdForGlobalRowOffset boundary tests + // ======================================================================= + + /** + * Multiple RGs: offsets on exact boundaries should return the correct rgId. 
+ * rgRowStart = [0, 100, 250, 400] → RG0=[0..99], RG1=[100..249], RG2=[250..399] + */ + @Test + public void testRgIdForGlobalRowOffset_boundaries() + { + int[] rgRowStart = {0, 100, 250, 400}; + assertEquals(0, RetinaResourceManager.rgIdForGlobalRowOffset(0, rgRowStart)); + assertEquals(0, RetinaResourceManager.rgIdForGlobalRowOffset(99, rgRowStart)); + assertEquals(1, RetinaResourceManager.rgIdForGlobalRowOffset(100, rgRowStart)); + assertEquals(1, RetinaResourceManager.rgIdForGlobalRowOffset(249, rgRowStart)); + assertEquals(2, RetinaResourceManager.rgIdForGlobalRowOffset(250, rgRowStart)); + assertEquals(2, RetinaResourceManager.rgIdForGlobalRowOffset(399, rgRowStart)); + } + + /** + * Many equal-sized RGs: stress the binary search across many intervals. + */ + @Test + public void testRgIdForGlobalRowOffset_manyRgs() + { + int numRgs = 64; + int rowsPerRg = 256; + int[] rgRowStart = new int[numRgs + 1]; + for (int i = 0; i <= numRgs; i++) + { + rgRowStart[i] = i * rowsPerRg; + } + for (int rg = 0; rg < numRgs; rg++) + { + int first = rg * rowsPerRg; + int last = first + rowsPerRg - 1; + assertEquals("first offset of rg=" + rg, rg, + RetinaResourceManager.rgIdForGlobalRowOffset(first, rgRowStart)); + assertEquals("last offset of rg=" + rg, rg, + RetinaResourceManager.rgIdForGlobalRowOffset(last, rgRowStart)); + } + } + + // ======================================================================= + // Section 7c: createCheckpointDirect vs createCheckpoint consistency + // ======================================================================= + + /** + * Both checkpoint paths (queued via rgVisibilityMap traversal and direct via + * pre-built entries) must produce byte-identical files when given the same + * visibility state. 
+ */ + @Test + public void testCheckpointDirect_matchesStandardCheckpoint() throws Exception + { + long ts = 500L; + int numFiles = 3; + int rowsPerRg = 64; + + for (int fid = 1; fid <= numFiles; fid++) + { + retinaManager.addVisibility(fid, 0, rowsPerRg, 0L, null, false); + for (int d = 0; d < fid; d++) + { + retinaManager.deleteRecord(fid, 0, d, ts - 100); + } + } + + // Build pre-built entries identical to what runGC() would construct. + List entries = new ArrayList<>(); + Field rgMapField = RetinaResourceManager.class.getDeclaredField("rgVisibilityMap"); + rgMapField.setAccessible(true); + @SuppressWarnings("unchecked") + Map rgMap = + (Map) rgMapField.get(retinaManager); + for (Map.Entry e : rgMap.entrySet()) + { + long fileId = RetinaUtils.parseFileIdFromRgKey(e.getKey()); + int rgId = RetinaUtils.parseRgIdFromRgKey(e.getKey()); + long[] bitmap = e.getValue().getVisibilityBitmap(ts); + entries.add(new CheckpointFileIO.CheckpointEntry( + fileId, rgId, (int) e.getValue().getRecordNum(), bitmap)); + } + + // Obtain the private CheckpointType.GC enum value via reflection. 
+        @SuppressWarnings("unchecked")
+        Class<? extends Enum<?>> checkpointTypeClass = (Class<? extends Enum<?>>)
+                Class.forName("io.pixelsdb.pixels.retina.RetinaResourceManager$CheckpointType");
+        Object gcType = null;
+        for (Object constant : checkpointTypeClass.getEnumConstants())
+        {
+            if (constant.toString().equals("GC"))
+            {
+                gcType = constant;
+                break;
+            }
+        }
+        assertNotNull("CheckpointType.GC must exist", gcType);
+
+        // Call createCheckpoint (standard path)
+        Method createCheckpointMethod = RetinaResourceManager.class.getDeclaredMethod(
+                "createCheckpoint", long.class, checkpointTypeClass);
+        createCheckpointMethod.setAccessible(true);
+        @SuppressWarnings("unchecked")
+        CompletableFuture<Void> f1 = (CompletableFuture<Void>) createCheckpointMethod.invoke(
+                retinaManager, ts, gcType);
+        f1.join();
+
+        // Call createCheckpointDirect (optimized path) with a different timestamp to get a different file name
+        long ts2 = ts + 1;
+        Method createCheckpointDirectMethod = RetinaResourceManager.class.getDeclaredMethod(
+                "createCheckpointDirect", long.class, checkpointTypeClass, List.class);
+        createCheckpointDirectMethod.setAccessible(true);
+        @SuppressWarnings("unchecked")
+        CompletableFuture<Void> f2 = (CompletableFuture<Void>) createCheckpointDirectMethod.invoke(
+                retinaManager, ts2, gcType, entries);
+        f2.join();
+
+        // Read both checkpoint files and compare entries.
+        // Files may have entries in different order (due to producer-consumer concurrency),
+        // so we normalize by sorting entries by (fileId, rgId) before comparing.
+ Field checkpointDirField = RetinaResourceManager.class.getDeclaredField("checkpointDir"); + checkpointDirField.setAccessible(true); + String checkpointDir = (String) checkpointDirField.get(retinaManager); + + Field hostField = RetinaResourceManager.class.getDeclaredField("retinaHostName"); + hostField.setAccessible(true); + String hostName = (String) hostField.get(retinaManager); + + String path1 = RetinaUtils.buildCheckpointPath( + checkpointDir, RetinaUtils.CHECKPOINT_PREFIX_GC, hostName, ts); + String path2 = RetinaUtils.buildCheckpointPath( + checkpointDir, RetinaUtils.CHECKPOINT_PREFIX_GC, hostName, ts2); + + Map standard = new HashMap<>(); + CheckpointFileIO.readCheckpointParallel(path1, entry -> + standard.put(entry.fileId + "_" + entry.rgId, + Arrays.copyOf(entry.bitmap, entry.bitmap.length))); + + Map direct = new HashMap<>(); + CheckpointFileIO.readCheckpointParallel(path2, entry -> + direct.put(entry.fileId + "_" + entry.rgId, + Arrays.copyOf(entry.bitmap, entry.bitmap.length))); + + assertEquals("entry count must match", standard.size(), direct.size()); + for (Map.Entry e : standard.entrySet()) + { + long[] directBitmap = direct.get(e.getKey()); + assertNotNull("direct checkpoint must contain key=" + e.getKey(), directBitmap); + assertTrue("bitmaps must be identical for key=" + e.getKey(), + Arrays.equals(e.getValue(), directBitmap)); + } + } + + // ======================================================================= + // Section 7d: concurrent dual-write pressure test + // ======================================================================= + + /** + * Multi-threaded stress test: concurrent {@code deleteRecord} calls with + * dual-write active. Each thread owns one exclusive row group and deletes + * its rows serially within that group, matching the production CDC contract + * (same-RG deletes are serialized; different-RG deletes may run in parallel). 
+ * Verifies that all deletes are correctly propagated in both forward and + * backward directions under inter-RG concurrency. + */ + @Test + public void testDualWrite_concurrentPressure() throws Exception + { + TypeDescription schema = LONG_ID_SCHEMA; + int numRgs = 8; + int rowsPerRg = 8; + long fileId = 50L; + + // Source file with numRgs row groups, rowsPerRg rows each. + // pixelStride must equal rowsPerRg so that each batch produces a full pixel, + // triggering the column writer to flush and making curRowGroupDataLength > 0. + // Without this, pixelStride=10_000 >> rowsPerRg causes outputStream.size()=0, + // the writer never flushes mid-batch, and all rows collapse into a single RG. + String srcPath = writeTestFileMultiRg("dw_conc_src.pxl", schema, numRgs, rowsPerRg, rowsPerRg); + + Map bitmaps = new HashMap<>(); + for (int rgId = 0; rgId < numRgs; rgId++) + { + bitmaps.put(fileId + "_" + rgId, makeBitmapForRows(rowsPerRg)); + retinaManager.addVisibility(fileId, rgId, rowsPerRg, 0L, null, false); + } + + // rowGroupSize=1 byte ensures the rewritten file flushes a new RG after every + // batch (any encoded pixel exceeds 1 byte), preserving the 1:1 old-RG-to-new-RG + // mapping so each thread targets a distinct new RGVisibility object. + StorageGarbageCollector localGc = new StorageGarbageCollector( + retinaManager, metadataService, 0.5, 134_217_728L, + Integer.MAX_VALUE, 10, 1, EncodingLevel.EL2, 86_400_000L); + + StorageGarbageCollector.RewriteResult result = + localGc.rewriteFileGroup(makeGroup(fileId, srcPath, schema), 100L, bitmaps); + long newFileId = result.newFileId; + assertTrue("new file should be registered", newFileId > 0); + assertEquals("rewritten file must have same RG count", numRgs, result.newFileRgCount); + + localGc.registerDualWrite(result); + + // One thread per RG; each thread owns rgId == t and deletes its rows serially. 
+ CyclicBarrier barrier = new CyclicBarrier(numRgs); + AtomicInteger errors = new AtomicInteger(0); + ExecutorService executor = Executors.newFixedThreadPool(numRgs); + + List> futures = new ArrayList<>(); + for (int t = 0; t < numRgs; t++) + { + final int rgId = t; + futures.add(executor.submit(() -> + { + try + { + barrier.await(); + for (int rgOff = 0; rgOff < rowsPerRg; rgOff++) + { + long deleteTs = 200L + rgId * rowsPerRg + rgOff; + if (rgOff % 2 == 0) + { + // Forward direction: delete from old file; dual-write + // propagates to the corresponding new-file row. + retinaManager.deleteRecord(fileId, rgId, rgOff, deleteTs); + } + else + { + // Backward direction: delete from new file; dual-write + // propagates back to the corresponding old-file row. + int[] fwdMapping = result.forwardRgMappings.get(fileId).get(rgId); + int newGlobal = fwdMapping[rgOff]; + if (newGlobal >= 0) + { + int newRgId = RetinaResourceManager.rgIdForGlobalRowOffset( + newGlobal, result.newFileRgRowStart); + int newRgOff = newGlobal - result.newFileRgRowStart[newRgId]; + retinaManager.deleteRecord(newFileId, newRgId, newRgOff, deleteTs); + } + } + } + } + catch (Exception e) + { + errors.incrementAndGet(); + } + })); + } + + for (java.util.concurrent.Future f : futures) + { + f.get(); + } + executor.shutdown(); + + assertEquals("no errors during concurrent deletes", 0, errors.get()); + + long queryTs = 200L + numRgs * rowsPerRg; + + // Verify every row in every old-file RG is deleted. + for (int rgId = 0; rgId < numRgs; rgId++) + { + long[] oldBitmap = retinaManager.queryVisibility(fileId, rgId, queryTs, 0L); + for (int r = 0; r < rowsPerRg; r++) + { + assertTrue("old file rgId=" + rgId + " row " + r + " should be deleted", + isBitSet(oldBitmap, r)); + } + } + + // Verify every corresponding row in the new file is also deleted. 
+ for (int rgId = 0; rgId < numRgs; rgId++) + { + int[] fwdMapping = result.forwardRgMappings.get(fileId).get(rgId); + for (int r = 0; r < rowsPerRg; r++) + { + int newGlobal = fwdMapping[r]; + if (newGlobal >= 0) + { + int newRgId = RetinaResourceManager.rgIdForGlobalRowOffset( + newGlobal, result.newFileRgRowStart); + int newRgOff = newGlobal - result.newFileRgRowStart[newRgId]; + long[] newBitmap = retinaManager.queryVisibility(newFileId, newRgId, queryTs, 0L); + assertTrue("new file rgId=" + newRgId + " row " + newRgOff + + " (from old rgId=" + rgId + " row " + r + ") should be deleted", + (newBitmap[newRgOff / 64] & (1L << (newRgOff % 64))) != 0); + } + } + } + + localGc.unregisterDualWrite(result); + } + + // ======================================================================= + // Section 8: index update + atomic switch + rollback + delayed cleanup + // ======================================================================= + + /** + * Atomicity with multiple old files: one TEMPORARY new file and three REGULAR + * old files are swapped in a single call. Verifies that after the call the new + * file is promoted to REGULAR and all old files are removed from the + * catalog—i.e., the UPDATE and DELETE execute as one indivisible transaction. 
+ */ + @Test + public void testAtomicSwap_multipleOldFilesAtomicity() throws Exception + { + long[] ids = {0, 1}; + long[] ts = {100, 100}; + + writeTestFile("atom_old1.pxl", LONG_ID_SCHEMA, ids, true, ts); + writeTestFile("atom_old2.pxl", LONG_ID_SCHEMA, ids, true, ts); + writeTestFile("atom_old3.pxl", LONG_ID_SCHEMA, ids, true, ts); + + long[] oldIds = registerTestFiles( + new String[]{"atom_old1.pxl", "atom_old2.pxl", "atom_old3.pxl"}, + new File.Type[]{File.Type.REGULAR, File.Type.REGULAR, File.Type.REGULAR}, + new int[]{1, 1, 1}, new long[]{0, 0, 0}, new long[]{1, 1, 1}); + long newFileId = registerTestFile("atom_new.pxl", File.Type.TEMPORARY, 1, 0, 1); + + File preSwapNew = metadataService.getFileById(newFileId); + assertNotNull("New file must exist before swap", preSwapNew); + assertEquals("New file should be TEMPORARY before swap", + File.Type.TEMPORARY, preSwapNew.getType()); + + metadataService.atomicSwapFiles(newFileId, Arrays.asList(oldIds[0], oldIds[1], oldIds[2])); + + assertFileRegular(newFileId, "New file should be REGULAR after swap"); + for (long oldId : oldIds) + { + assertFileGone(oldId, "Old file " + oldId + " should be gone after swap"); + } + } + + /** + * Idempotency: calling {@code atomicSwapFiles} a second time after the swap has + * already committed must not throw. The UPDATE is a no-op (already REGULAR) and + * the DELETE is a no-op (old files already removed). 
+ */ + @Test + public void testAtomicSwap_idempotent() throws Exception + { + writeTestFile("idem_old.pxl", LONG_ID_SCHEMA, new long[]{0, 1, 2}, true, new long[]{100, 100, 100}); + long oldFileId = registerTestFile("idem_old.pxl", File.Type.REGULAR, 1, 0, 2); + long newFileId = registerTestFile("idem_new.pxl", File.Type.TEMPORARY, 1, 0, 2); + + metadataService.atomicSwapFiles(newFileId, Collections.singletonList(oldFileId)); + assertFileRegular(newFileId, "File should be REGULAR after first swap"); + + metadataService.atomicSwapFiles(newFileId, Collections.singletonList(oldFileId)); + + assertFileRegular(newFileId, "File should remain REGULAR after idempotent retry"); + assertFileGone(oldFileId, "Old file should remain absent after idempotent retry"); + } + + /** + * TEMPORARY visibility semantics: before the swap, {@code getFiles(pathId)} must + * not return the TEMPORARY new file (the DAO filters {@code FILE_TYPE <> 0}). + * After the swap the promoted file is visible and the old file disappears. 
+ */ + @Test + public void testAtomicSwap_temporaryInvisibleViaGetFiles() throws Exception + { + writeTestFile("vis_old.pxl", LONG_ID_SCHEMA, new long[]{0, 1}, true, new long[]{100, 100}); + long[] fileIds = registerTestFiles( + new String[]{"vis_old.pxl", "vis_new_temp.pxl"}, + new File.Type[]{File.Type.REGULAR, File.Type.TEMPORARY}, + new int[]{1, 1}, new long[]{0, 0}, new long[]{1, 1}); + long oldFileId = fileIds[0]; + long tempFileId = fileIds[1]; + + List beforeSwap = metadataService.getFiles(testPathId); + Set beforeIds = new HashSet<>(); + for (File f : beforeSwap) + { + beforeIds.add(f.getId()); + } + assertTrue("REGULAR old file should be visible via getFiles before swap", + beforeIds.contains(oldFileId)); + assertFalse("TEMPORARY new file must NOT be visible via getFiles before swap", + beforeIds.contains(tempFileId)); + + metadataService.atomicSwapFiles(tempFileId, Collections.singletonList(oldFileId)); + + List afterSwap = metadataService.getFiles(testPathId); + Set afterIds = new HashSet<>(); + for (File f : afterSwap) + { + afterIds.add(f.getId()); + } + assertTrue("Promoted file should be visible via getFiles after swap", + afterIds.contains(tempFileId)); + assertFalse("Old file should NOT be visible via getFiles after swap", + afterIds.contains(oldFileId)); + } + + /** + * Multiple serial swaps: Storage GC processes FileGroups serially on a single + * thread, so {@code atomicSwapFiles} is never called concurrently in production. + * This test reflects that design: N independent (newFile, oldFile) pairs are + * swapped one after another, and every new file ends up REGULAR while every + * old file is removed. 
+ */ + @Test + public void testAtomicSwap_multipleSerialSwaps() throws Exception + { + TypeDescription schema = LONG_ID_SCHEMA; + long[] ids = {0}; + long[] ts = {100}; + int nPairs = 8; + + long[] newFileIds = new long[nPairs]; + long[] oldFileIds = new long[nPairs]; + + for (int i = 0; i < nPairs; i++) + { + String oldName = "serial_old_" + i + ".pxl"; + String newName = "serial_new_" + i + ".pxl"; + writeTestFile(oldName, schema, ids, true, ts); + + long[] pair = registerTestFiles( + new String[]{oldName, newName}, + new File.Type[]{File.Type.REGULAR, File.Type.TEMPORARY}, + new int[]{1, 1}, new long[]{0, 0}, new long[]{0, 0}); + oldFileIds[i] = pair[0]; + newFileIds[i] = pair[1]; + } + + for (int i = 0; i < nPairs; i++) + { + metadataService.atomicSwapFiles(newFileIds[i], + Collections.singletonList(oldFileIds[i])); + } + + for (int i = 0; i < nPairs; i++) + { + assertFileRegular(newFileIds[i], "Promoted file " + i + " must be REGULAR"); + assertFileGone(oldFileIds[i], "Old file " + i + " should be gone"); + } + } + + /** + * Partial old-files-already-gone: one old file is deleted before the swap, but + * {@code atomicSwapFiles} is called with both IDs. The DELETE-WHERE-IN for an + * already-absent row is a no-op; the transaction must still commit, promoting the + * new file and removing the remaining old file. 
+ */ + @Test + public void testAtomicSwap_partialOldFilesAlreadyGone() throws Exception + { + writeTestFile("partial_old1.pxl", LONG_ID_SCHEMA, new long[]{0, 1}, true, new long[]{100, 100}); + writeTestFile("partial_old2.pxl", LONG_ID_SCHEMA, new long[]{0, 1}, true, new long[]{100, 100}); + + long[] oldIds = registerTestFiles( + new String[]{"partial_old1.pxl", "partial_old2.pxl"}, + new File.Type[]{File.Type.REGULAR, File.Type.REGULAR}, + new int[]{1, 1}, new long[]{0, 0}, new long[]{1, 1}); + + metadataService.deleteFiles(Collections.singletonList(oldIds[0])); + assertFileGone(oldIds[0], "old1 should be gone before swap"); + + long newFileId = registerTestFile("partial_new.pxl", File.Type.TEMPORARY, 1, 0, 1); + metadataService.atomicSwapFiles(newFileId, Arrays.asList(oldIds[0], oldIds[1])); + + assertFileRegular(newFileId, "New file must be REGULAR"); + assertFileGone(oldIds[1], "Remaining old file should be gone"); + } + + /** + * Rollback after rewrite + dual-write: verifies that Visibility entries for the new + * file are removed, dual-write is unregistered, the TEMPORARY catalog entry is deleted, + * and the physical file is cleaned up. 
+ */ + @Test + public void testAtomicSwap_rollbackCleansUp() throws Exception + { + long[] ids = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + long[] ts = new long[10]; + Arrays.fill(ts, 100); + String filePath = writeTestFile("rollback_src.pxl", LONG_ID_SCHEMA, ids, true, ts); + long srcFileId = registerTestFile("rollback_src.pxl", File.Type.REGULAR, 1, 0, 9); + + StorageGarbageCollector.FileGroup group = makeGroup(srcFileId, filePath, LONG_ID_SCHEMA); + + Map bitmaps = new HashMap<>(); + bitmaps.put(RetinaUtils.buildRgKey(srcFileId, 0), makeBitmap(10, 6)); + + retinaManager.addVisibility(srcFileId, 0, 10, 50, null, true); + + StorageGarbageCollector.RewriteResult result = gc.rewriteFileGroup(group, 100, bitmaps); + assertTrue("New file should be created", result.newFileId > 0); + + gc.registerDualWrite(result); + + gc.rollback(result); + + assertFalse("New file should be deleted after rollback", + fileStorage.exists(result.newFilePath)); + + assertFileGone(result.newFileId, "Catalog entry should be deleted after rollback"); + } + + /** Delayed cleanup removes old file Visibility and physical file after wall-clock deadline passes. 
*/ + @Test + public void testAtomicSwap_delayedCleanup() throws Exception + { + TypeDescription schema = LONG_ID_SCHEMA; + long[] ids = {0, 1, 2, 3, 4}; + long[] ts = new long[5]; + Arrays.fill(ts, 100); + String filePath = writeTestFile("delayed_old.pxl", schema, ids, true, ts); + long fakeFileId = 999999L; + + retinaManager.addVisibility(fakeFileId, 0, 5, 50, null, true); + + long futureDeadline = System.currentTimeMillis() + 60_000L; + RetinaResourceManager.RetiredFile retiredFuture = new RetinaResourceManager.RetiredFile( + fakeFileId, 1, filePath, futureDeadline, Collections.emptyList()); + retinaManager.scheduleRetiredFile(retiredFuture); + + retinaManager.processRetiredFiles(); + assertTrue("File should NOT be cleaned before deadline", + fileStorage.exists(filePath)); + + resetManagerState(); + + retinaManager.addVisibility(fakeFileId, 0, 5, 50, null, true); + + long pastDeadline = System.currentTimeMillis() - 1L; + RetinaResourceManager.RetiredFile retiredPast = new RetinaResourceManager.RetiredFile( + fakeFileId, 1, filePath, pastDeadline, Collections.emptyList()); + retinaManager.scheduleRetiredFile(retiredPast); + + retinaManager.processRetiredFiles(); + assertFalse("File should be cleaned after deadline", + fileStorage.exists(filePath)); + } + + // ======================================================================= + // Section 9: end-to-end integration tests (placeholder) + // ======================================================================= + + /** + * Comprehensive end-to-end test covering the full Storage GC lifecycle with + * real deletion chains. Exercises every step of the pipeline: + * + *
    +     * <pre>
    +     * Phase 1 (ts ≤ safeGcTs=100): delete 6 rows → physically removed by rewrite
    +     * Phase 2 (ts=150, before dual-write): delete row 1 → only in old chain, needs export
    +     * Rewrite → verify data, forward/backward mappings, hidden column preservation
    +     * Register dual-write
    +     * Phase 3 (ts=200, dual-write active): delete row 3 → propagated to both files
    +     * Sync visibility → export + coord-transform + import
    +     * Phase 4 (ts=300, post-sync, dual-write still active): delete row 5
    +     * Commit → atomic swap (TEMPORARY→REGULAR), old file removed from catalog
    +     * Verify: multi-snap_ts consistency on new file at ts=100..500
    +     * Verify: old file gone from catalog, new file REGULAR
    +     * </pre>
    + */ + @Test + public void testEndToEnd_fullGcCycle() throws Exception + { + int numRows = 10; + long[] ids = new long[numRows]; + long[] createTs = new long[numRows]; + for (int i = 0; i < numRows; i++) + { + ids[i] = i * 10; + createTs[i] = 50L; + } + String srcPath = writeTestFile("e2e_full_src.pxl", LONG_ID_SCHEMA, ids, true, createTs); + long srcFileId = registerTestFile("e2e_full_src.pxl", File.Type.REGULAR, 1, 0, numRows - 1); + + retinaManager.addVisibility(srcFileId, 0, numRows, 0L, null, false); + + long safeGcTs = 100L; + retinaManager.deleteRecord(srcFileId, 0, 0, 10L); + retinaManager.deleteRecord(srcFileId, 0, 2, 20L); + retinaManager.deleteRecord(srcFileId, 0, 4, 30L); + retinaManager.deleteRecord(srcFileId, 0, 6, 50L); + retinaManager.deleteRecord(srcFileId, 0, 8, 70L); + retinaManager.deleteRecord(srcFileId, 0, 9, 90L); + + retinaManager.deleteRecord(srcFileId, 0, 1, 150L); + + long[] gcBitmap = retinaManager.queryVisibility(srcFileId, 0, safeGcTs, 0L); + Map bitmaps = new HashMap<>(); + bitmaps.put(RetinaUtils.buildRgKey(srcFileId, 0), gcBitmap); + + for (int r : new int[]{0, 2, 4, 6, 8, 9}) + { + assertTrue("row " + r + " should be in GC bitmap", isBitSet(gcBitmap, r)); + } + for (int r : new int[]{1, 3, 5, 7}) + { + assertFalse("row " + r + " should NOT be in GC bitmap", isBitSet(gcBitmap, r)); + } + + NoIndexSyncGC e2eGc = new NoIndexSyncGC(retinaManager, metadataService, + 0.5, 134_217_728L, Integer.MAX_VALUE, 10, 1048576, + EncodingLevel.EL2, 86_400_000L); + + StorageGarbageCollector.FileGroup group = makeGroup(srcFileId, srcPath, LONG_ID_SCHEMA); + + StorageGarbageCollector.RewriteResult result = + e2eGc.rewriteFileGroup(group, safeGcTs, bitmaps); + long newFileId = result.newFileId; + assertTrue("new file must be created", newFileId > 0); + assertRewriteResultConsistency(result, 4); + + long[][] rows = readAllRows(result.newFilePath, LONG_ID_SCHEMA, true); + assertEquals("4 survivors expected (rows 1,3,5,7)", 4, rows.length); + long[] 
expectedIds = {10L, 30L, 50L, 70L}; + for (int i = 0; i < 4; i++) + { + assertEquals("id mismatch at new row " + i, expectedIds[i], rows[i][0]); + assertEquals("create_ts mismatch at new row " + i, 50L, rows[i][1]); + } + + int[] fwd = result.forwardRgMappings.get(srcFileId).get(0); + assertEquals(-1, fwd[0]); + assertEquals(0, fwd[1]); + assertEquals(-1, fwd[2]); + assertEquals(1, fwd[3]); + assertEquals(-1, fwd[4]); + assertEquals(2, fwd[5]); + assertEquals(-1, fwd[6]); + assertEquals(3, fwd[7]); + assertEquals(-1, fwd[8]); + assertEquals(-1, fwd[9]); + + assertEquals(1, result.backwardInfos.size()); + StorageGarbageCollector.BackwardInfo bwd = result.backwardInfos.get(0); + assertEquals(srcFileId, bwd.oldFileId); + + e2eGc.registerDualWrite(result); + + retinaManager.deleteRecord(srcFileId, 0, 3, 200L); + int newRowForOld3 = fwd[3]; + assertTrue("fwd[3] should be valid", newRowForOld3 >= 0); + long[] dualBm = retinaManager.queryVisibility(newFileId, 0, 200L, 0L); + assertTrue("dual-write: new row " + newRowForOld3 + " should be deleted", + isBitSet(dualBm, newRowForOld3)); + + e2eGc.syncVisibility(result, safeGcTs); + + int newRowForOld1 = fwd[1]; + long[] syncBm = retinaManager.queryVisibility(newFileId, 0, 150L, 0L); + assertTrue("sync: new row " + newRowForOld1 + " should show old row 1 deleted at ts=150", + isBitSet(syncBm, newRowForOld1)); + + retinaManager.deleteRecord(srcFileId, 0, 5, 300L); + int newRowForOld5 = fwd[5]; + assertTrue("fwd[5] should be valid", newRowForOld5 >= 0); + + e2eGc.syncIndex(result, group.tableId); + e2eGc.commitFileGroup(result); + + assertFileRegular(newFileId, "new file should be REGULAR after commit"); + assertFileGone(srcFileId, "old file should be gone from catalog after commit"); + + assertTrue("old physical file should still exist (delayed cleanup, not yet due)", + fileStorage.exists(srcPath)); + + for (long snap : new long[]{100L, 149L, 150L, 199L, 200L, 299L, 300L, 500L}) + { + long[] bm = 
retinaManager.queryVisibility(newFileId, 0, snap, 0L); + assertEquals("snap=" + snap + " newRow0 (old row1, del@150)", snap >= 150, isBitSet(bm, 0)); + assertEquals("snap=" + snap + " newRow1 (old row3, del@200)", snap >= 200, isBitSet(bm, 1)); + assertEquals("snap=" + snap + " newRow2 (old row5, del@300)", snap >= 300, isBitSet(bm, 2)); + assertFalse("snap=" + snap + " newRow3 (old row7) should never be deleted", isBitSet(bm, 3)); + } + + for (long snap : new long[]{100L, 150L, 200L, 300L, 500L}) + { + long[] oldBm = retinaManager.queryVisibility(srcFileId, 0, snap, 0L); + long[] newBm = retinaManager.queryVisibility(newFileId, 0, snap, 0L); + for (int oldRow = 1; oldRow <= 7; oldRow += 2) + { + int newRow = fwd[oldRow]; + assertTrue("old row " + oldRow + " should have valid mapping", newRow >= 0); + assertEquals("snap=" + snap + " old row " + oldRow + " vs new row " + newRow + + " visibility mismatch", isBitSet(oldBm, oldRow), isBitSet(newBm, newRow)); + } + } + } + + /** Inject failure after visibility sync → rollback clean → re-run succeeds. */ + @Ignore("rollback end-to-end not yet implemented") + @Test + public void testEndToEnd_rollbackOnVisibilitySyncFailure() throws Exception + { + } + + /** WAL crash recovery from each state → correct resume or rollback. */ + @Ignore("crash recovery not yet implemented") + @Test + public void testEndToEnd_crashRecovery() throws Exception + { + } + + /** + * Concurrent INSERT/DELETE/UPDATE + GC → all operations correct. + * + *

    Simulates a realistic CDC event stream (serial per virtualNodeId) running + * concurrently with a full Storage GC pipeline (S2→S6). CDC events include + * DELETE, INSERT (new file + addVisibility), and UPDATE (deleteRecord + INSERT). + * Events are spread across the entire GC execution window with short sleep + * intervals to maximize interleaving with every GC phase. + * + *
    +     * <pre>
    +     * Phase 0: 30 rows, delete 18 (ts 5..90 < safeGcTs=100), 12 survivors (rows 18-29)
    +     * Phase 1: CDC thread + GC thread start concurrently via CyclicBarrier
    +     *   CDC: DELETE/INSERT/UPDATE events at ts=150..550, interleaving with GC
    +     *   GC:  rewrite → registerDualWrite → syncVisibility → syncIndex(stub) → commit
    +     * Phase 2: post-GC CDC deletes rows 27-29 at ts=600..700 (dual-write off)
    +     * Phase 3: Verification — multi-snap_ts consistency, catalog, data, no errors
    +     * </pre>
    + */ + @Test + public void testEndToEnd_concurrentCdcAndGc() throws Exception + { + TypeDescription schema = LONG_ID_SCHEMA; + int numRows = 30; + long safeGcTs = 100L; + + // ── Phase 0: Setup source file ────────────────────────────────────── + long[] ids = new long[numRows]; + long[] createTs = new long[numRows]; + for (int i = 0; i < numRows; i++) + { + ids[i] = i * 10; + createTs[i] = 50L; + } + String srcPath = writeTestFile("conc_cdc_gc.pxl", LONG_ID_SCHEMA, ids, true, createTs); + long srcFileId = registerTestFile("conc_cdc_gc.pxl", File.Type.REGULAR, 1, 0, numRows - 1); + + retinaManager.addVisibility(srcFileId, 0, numRows, 0L, null, false); + + int deletedBefore = 18; + for (int i = 0; i < deletedBefore; i++) + { + retinaManager.deleteRecord(srcFileId, 0, i, 5L + i * 5L); + } + + long[] gcBitmap = retinaManager.queryVisibility(srcFileId, 0, safeGcTs, 0L); + Map bitmaps = new HashMap<>(); + bitmaps.put(RetinaUtils.buildRgKey(srcFileId, 0), gcBitmap); + + for (int r = 0; r < deletedBefore; r++) + { + assertTrue("row " + r + " must be in GC bitmap", isBitSet(gcBitmap, r)); + } + int survivors = numRows - deletedBefore; + for (int r = deletedBefore; r < numRows; r++) + { + assertFalse("row " + r + " must NOT be in GC bitmap", isBitSet(gcBitmap, r)); + } + + // ── Prepare GC and concurrency primitives ─────────────────────────── + NoIndexSyncGC concGc = new NoIndexSyncGC(retinaManager, metadataService, + 0.5, 134_217_728L, Integer.MAX_VALUE, 10, 1048576, + EncodingLevel.EL2, 86_400_000L); + + StorageGarbageCollector.FileGroup group = makeGroup(srcFileId, srcPath, LONG_ID_SCHEMA); + + CyclicBarrier barrier = new CyclicBarrier(2); + java.util.concurrent.CountDownLatch dualWriteActive = new java.util.concurrent.CountDownLatch(1); + java.util.concurrent.CountDownLatch cdcPhaseBDone = new java.util.concurrent.CountDownLatch(1); + java.util.concurrent.CountDownLatch gcDone = new java.util.concurrent.CountDownLatch(1); + AtomicInteger errors = new 
AtomicInteger(0); + java.util.concurrent.atomic.AtomicReference resultRef = + new java.util.concurrent.atomic.AtomicReference<>(null); + java.util.concurrent.atomic.AtomicReference gcError = + new java.util.concurrent.atomic.AtomicReference<>(null); + java.util.concurrent.atomic.AtomicReference cdcError = + new java.util.concurrent.atomic.AtomicReference<>(null); + + List insertedFileIds = Collections.synchronizedList(new ArrayList<>()); + + // ── Phase 1: Launch CDC + GC threads concurrently ─────────────────── + + // CDC thread: serial events on the same (table, virtualNodeId). + // Phase A runs during S2 rewrite (pre-dual-write) → captured by export. + // Phase B runs during dual-write window (S3-S6) → forwarded by dual-write. + // Phase C runs after GC commit → only reaches old file. + Thread cdcThread = new Thread(() -> + { + try + { + barrier.await(); + + // Phase A + B wrapped in try-finally so that cdcPhaseBDone + // is always signalled, even if an exception occurs. + try + { + // ── Phase A: pre-dual-write deletes (during S2 rewrite) ─ + // These go only to the old file's chain; syncVisibility + // export will capture them (ts > safeGcTs). + retinaManager.deleteRecord(srcFileId, 0, 18, 150L); + Thread.sleep(2); + retinaManager.deleteRecord(srcFileId, 0, 19, 200L); + Thread.sleep(2); + retinaManager.deleteRecord(srcFileId, 0, 20, 250L); + + // Wait until GC has registered dual-write + dualWriteActive.await(); + + // ── Phase B: dual-write window (S3→S6) ────────────────── + // Deletes are forwarded to both old and new file. + // INSERT/UPDATE interleaved to test concurrent addVisibility. 
+ retinaManager.deleteRecord(srcFileId, 0, 21, 300L); + Thread.sleep(5); + + // INSERT: new file cdc_ins_1.pxl (2 rows) + long[] insIds1 = {1000L, 1001L}; + long[] insTs1 = {300L, 300L}; + writeTestFile("cdc_ins_1.pxl", schema, insIds1, true, insTs1); + long insFileId1 = 9001L; + retinaManager.addVisibility(insFileId1, 0, 2, 0L, null, false); + insertedFileIds.add(insFileId1); + + retinaManager.deleteRecord(srcFileId, 0, 22, 350L); + Thread.sleep(5); + + // UPDATE row 23 @ ts=400: delete old + insert new + retinaManager.deleteRecord(srcFileId, 0, 23, 400L); + long[] updIds1 = {2000L}; + long[] updTs1 = {400L}; + writeTestFile("cdc_upd_1.pxl", schema, updIds1, true, updTs1); + long updFileId1 = 9002L; + retinaManager.addVisibility(updFileId1, 0, 1, 0L, null, false); + insertedFileIds.add(updFileId1); + + retinaManager.deleteRecord(srcFileId, 0, 24, 450L); + Thread.sleep(5); + + retinaManager.deleteRecord(srcFileId, 0, 25, 500L); + Thread.sleep(5); + + // INSERT: new file cdc_ins_2.pxl (2 rows) + long[] insIds2 = {1002L, 1003L}; + long[] insTs2 = {500L, 500L}; + writeTestFile("cdc_ins_2.pxl", schema, insIds2, true, insTs2); + long insFileId2 = 9003L; + retinaManager.addVisibility(insFileId2, 0, 2, 0L, null, false); + insertedFileIds.add(insFileId2); + + retinaManager.deleteRecord(srcFileId, 0, 26, 550L); + } + finally + { + cdcPhaseBDone.countDown(); + } + + // ── Phase C: Wait for GC commit, then post-GC deletes ─────── + gcDone.await(); + + // Dual-write is now off; these only go to old file + retinaManager.deleteRecord(srcFileId, 0, 27, 600L); + Thread.sleep(5); + retinaManager.deleteRecord(srcFileId, 0, 28, 650L); + Thread.sleep(5); + retinaManager.deleteRecord(srcFileId, 0, 29, 700L); + } + catch (Throwable t) + { + cdcError.set(t); + errors.incrementAndGet(); + } + }); + + // GC thread: full pipeline + Thread gcThread = new Thread(() -> + { + try + { + barrier.await(); + + StorageGarbageCollector.RewriteResult result = + concGc.rewriteFileGroup(group, safeGcTs, 
bitmaps); + assertTrue("new file must be created", result.newFileId > 0); + + concGc.registerDualWrite(result); + dualWriteActive.countDown(); + + concGc.syncVisibility(result, safeGcTs); + concGc.syncIndex(result, group.tableId); + + cdcPhaseBDone.await(); + concGc.commitFileGroup(result); + + resultRef.set(result); + } + catch (Throwable t) + { + gcError.set(t); + errors.incrementAndGet(); + dualWriteActive.countDown(); + } + finally + { + gcDone.countDown(); + } + }); + + cdcThread.start(); + gcThread.start(); + + cdcThread.join(30_000); + gcThread.join(30_000); + + if (gcError.get() != null) + { + throw new AssertionError("GC thread failed", gcError.get()); + } + if (cdcError.get() != null) + { + throw new AssertionError("CDC thread failed", cdcError.get()); + } + + assertFalse("CDC thread should have finished", cdcThread.isAlive()); + assertFalse("GC thread should have finished", gcThread.isAlive()); + assertEquals("no errors during concurrent execution", 0, errors.get()); + + // ── Phase 3: Verification ─────────────────────────────────────────── + StorageGarbageCollector.RewriteResult result = resultRef.get(); + assertNotNull("RewriteResult must be available", result); + long newFileId = result.newFileId; + assertTrue("new file must have a valid id", newFileId > 0); + + // 3a. Verify new file data: 12 survivors (rows 18-29) + assertRewriteResultConsistency(result, survivors); + long[][] rows = readAllRows(result.newFilePath, schema, true); + assertEquals("12 survivors expected", survivors, rows.length); + for (int i = 0; i < survivors; i++) + { + long expectedId = (deletedBefore + i) * 10L; + assertEquals("id mismatch at new row " + i, expectedId, rows[i][0]); + assertEquals("create_ts mismatch at new row " + i, 50L, rows[i][1]); + } + + // 3b. Verify catalog state + assertFileRegular(newFileId, "new file should be REGULAR"); + assertFileGone(srcFileId, "old file should be gone from catalog"); + + // 3c. 
Forward mapping + int[] fwd = result.forwardRgMappings.get(srcFileId).get(0); + for (int r = 0; r < deletedBefore; r++) + { + assertEquals("deleted row " + r + " should map to -1", -1, fwd[r]); + } + for (int r = deletedBefore; r < numRows; r++) + { + assertTrue("surviving row " + r + " should have valid mapping", + fwd[r] >= 0); + } + + // 3d. Multi-snap_ts visibility consistency for rows in dual-write window. + // During dual-write, deletes on old file are forwarded to new file, + // and export+import syncs pre-dual-write deletes. So for any snap_ts, + // old and new visibility must agree for rows that were deleted BEFORE + // dual-write was unregistered (i.e., before commit). + // CDC deletes on rows 18-26 have ts=150..550, all happen before/during GC. + for (long snapTs : new long[]{100L, 150L, 200L, 250L, 300L, 350L, 400L, + 450L, 500L, 550L, 800L, 1000L}) + { + long[] oldBm = retinaManager.queryVisibility(srcFileId, 0, snapTs, 0L); + for (int oldRow = deletedBefore; oldRow < numRows; oldRow++) + { + int newGlobal = fwd[oldRow]; + assertTrue("old row " + oldRow + " must have valid fwd mapping", + newGlobal >= 0); + int newRgId = RetinaResourceManager.rgIdForGlobalRowOffset( + newGlobal, result.newFileRgRowStart); + int newRgOff = newGlobal - result.newFileRgRowStart[newRgId]; + long[] newBm = retinaManager.queryVisibility(newFileId, newRgId, snapTs, 0L); + + boolean oldDel = isBitSet(oldBm, oldRow); + boolean newDel = isBitSet(newBm, newRgOff); + + if (oldRow <= 26) + { + // Rows 18-26: deleted during dual-write window → must be consistent + assertEquals("snap_ts=" + snapTs + " oldRow=" + oldRow + + " newRgOff=" + newRgOff + ": visibility mismatch", + oldDel, newDel); + } + else + { + // Rows 27-29: deleted after GC commit (dual-write off). + // Old file shows them as deleted for snap_ts >= their delete_ts; + // new file should NOT reflect these post-GC deletes. 
+ assertFalse("snap_ts=" + snapTs + " newRow for oldRow=" + oldRow + + " should NOT be deleted (post-GC delete)", + newDel); + } + } + } + + // 3e. Verify INSERT files are unaffected by GC + for (long insFileId : insertedFileIds) + { + long[] insBm = retinaManager.queryVisibility(insFileId, 0, 1000L, 0L); + assertNotNull("INSERT file " + insFileId + " visibility should exist", insBm); + for (int w = 0; w < insBm.length; w++) + { + assertEquals("INSERT file " + insFileId + " should have no deletes", + 0L, insBm[w]); + } + } + } + + // ======================================================================= + // E2E: Multi-round CDC + GC lifecycle + // ======================================================================= + + /** + * Simulates a realistic multi-round CDC + GC lifecycle with three GC rounds + * interleaved with serial CDC operations (INSERT, UPDATE, DELETE). + * + *

    In production: + *

      + *
    • CDC operations on a given (table, vnode) are serial (one at a time).
    • + *
    • GC runs periodically via {@code scheduleAtFixedRate}, serial.
    • + *
    • Between GC rounds, more CDC events arrive and accumulate deletions.
    • + *
    + * + *
    +     * CDC epoch 1 (ts 10~90):
    +     *   INSERT file-A (20 rows, create_ts=5), DELETE 10 rows (0,2,4,...,18)
    +     * GC round 1 (safeGcTs=100):
    +     *   Memory GC → bitmap → rewrite file-A → file-A'
    +     *   file-A enters retired queue (retireDelayMs=0 → eligible immediately)
    +     *
    +     * CDC epoch 2 (ts 150~250):
    +     *   INSERT file-B (10 rows, create_ts=120)
    +     *   DELETE 5 rows in file-A' (new rows 0,2,4,6,8 → old rows 1,5,9,13,17)
    +     *   UPDATE row in file-A': delete new-row-1 + insert file-C (1 row)
    +     * GC round 2 (safeGcTs=300):
    +     *   processRetiredFiles → clean file-A physical file
    +     *   Memory GC → bitmap → rewrite file-A' → file-A''
    +     *   file-A' enters retired queue
    +     *
    +     * CDC epoch 3 (ts 350~400):
    +     *   DELETE 3 rows in file-B (rows 0,1,2)
    +     *   DELETE 2 rows in file-A'' (new rows 0,1)
    +     * GC round 3 (safeGcTs=450):
    +     *   processRetiredFiles → clean file-A'
    +     *   Memory GC → bitmap for file-A'', file-B, file-C
    +     *   rewrite file-B → file-B' (file-A'' and file-C below threshold)
    +     *
    +     * Final verification:
    +     *   - all surviving data readable with correct id/create_ts
    +     *   - visibility consistent across multiple snap_ts
    +     *   - catalog: only latest generation files exist as REGULAR
    +     *   - physical files from round 1 and 2 cleaned up
    +     * 
    + */ + @Test + public void testEndToEnd_multiRoundCdcGcLifecycle() throws Exception + { + TypeDescription schema = LONG_ID_SCHEMA; + + // ── CDC epoch 1 ───────────────────────────────────────────────────── + // INSERT file-A: 20 rows (id=0,10,20,...,190), create_ts=5 + int numRowsA = 20; + long[] idsA = new long[numRowsA]; + long[] tsA = new long[numRowsA]; + for (int i = 0; i < numRowsA; i++) + { + idsA[i] = i * 10; + tsA[i] = 5L; + } + String pathA = writeTestFile("lifecycle_a.pxl", schema, idsA, true, tsA); + long fileIdA = registerTestFile("lifecycle_a.pxl", File.Type.REGULAR, 1, 0, numRowsA - 1); + + retinaManager.addVisibility(fileIdA, 0, numRowsA, 0L, null, false); + + // DELETE 10 even-indexed rows in file-A (rows 0,2,4,...,18) at ts=10..90 + for (int i = 0; i < 10; i++) + { + retinaManager.deleteRecord(fileIdA, 0, i * 2, 10L + i * 9L); + } + // Survivors in file-A at safeGcTs=100: odd rows 1,3,5,...,19 → ids 10,30,50,...,190 + + // ── GC round 1 (safeGcTs=100) ────────────────────────────────────── + long safeGcTs1 = 100L; + NoIndexSyncGC gcR1 = new NoIndexSyncGC(retinaManager, metadataService, + 0.3, 134_217_728L, Integer.MAX_VALUE, 10, 1048576, + EncodingLevel.EL2, 0L); + + // Step 1: Memory GC → gcSnapshotBitmap + Map bitmaps1 = new HashMap<>(); + Map rgMap1 = getRgVisibilityMap(); + for (Map.Entry entry : rgMap1.entrySet()) + { + long[] bitmap = entry.getValue().garbageCollect(safeGcTs1); + bitmaps1.put(entry.getKey(), bitmap); + } + + // Step 2: Storage GC → rewrite file-A → file-A' + StorageGarbageCollector.FileGroup groupA = + makeGroup(fileIdA, pathA, schema); + StorageGarbageCollector.RewriteResult resultR1 = + gcR1.rewriteFileGroup(groupA, safeGcTs1, bitmaps1); + long fileIdAprime = resultR1.newFileId; + assertTrue("GC round 1 must create new file", fileIdAprime > 0); + + long[][] rowsAprime = readAllRows(resultR1.newFilePath, schema, true); + assertEquals("10 survivors after round 1", 10, rowsAprime.length); + for (int i = 0; i < 10; i++) + 
{ + assertEquals("round 1: id mismatch at new row " + i, + (2 * i + 1) * 10L, rowsAprime[i][0]); + assertEquals("round 1: create_ts preserved", + 5L, rowsAprime[i][1]); + } + + gcR1.registerDualWrite(resultR1); + gcR1.syncVisibility(resultR1, safeGcTs1); + gcR1.syncIndex(resultR1, groupA.tableId); + gcR1.commitFileGroup(resultR1); + + assertFileRegular(fileIdAprime, "file-A' must be REGULAR after commit"); + + String pathAprime = resultR1.newFilePath; + assertTrue("file-A physical should still exist (in retired queue)", fileStorage.exists(pathA)); + + // ── CDC epoch 2 (ts 150~250) ──────────────────────────────────────── + // INSERT file-B: 10 rows (id=1000..1009), create_ts=120 + int numRowsB = 10; + long[] idsB = new long[numRowsB]; + long[] tsB = new long[numRowsB]; + for (int i = 0; i < numRowsB; i++) + { + idsB[i] = 1000 + i; + tsB[i] = 120L; + } + writeTestFile("lifecycle_b.pxl", schema, idsB, true, tsB); + long fileIdB = registerTestFile("lifecycle_b.pxl", File.Type.REGULAR, 1, 0, numRowsB - 1); + + retinaManager.addVisibility(fileIdB, 0, numRowsB, 0L, null, false); + + // DELETE 5 rows in file-A' (new rows 0,2,4,6,8) at ts=150~190 + for (int i = 0; i < 5; i++) + { + retinaManager.deleteRecord(fileIdAprime, 0, i * 2, 150L + i * 10L); + } + + // UPDATE: delete new-row-1 in file-A' at ts=200, insert file-C (1 row) + retinaManager.deleteRecord(fileIdAprime, 0, 1, 200L); + + long[] idsC = {9999L}; + long[] tsC = {200L}; + writeTestFile("lifecycle_c.pxl", schema, idsC, true, tsC); + long fileIdC = registerTestFile("lifecycle_c.pxl", File.Type.REGULAR, 1, 0, 0); + + retinaManager.addVisibility(fileIdC, 0, 1, 0L, null, false); + + // ── GC round 2 (safeGcTs=300) ────────────────────────────────────── + long safeGcTs2 = 300L; + NoIndexSyncGC gcR2 = new NoIndexSyncGC(retinaManager, metadataService, + 0.3, 134_217_728L, Integer.MAX_VALUE, 10, 1048576, + EncodingLevel.EL2, 0L); + + // processRetiredFiles → file-A should be cleaned (retireDelayMs=0) + 
retinaManager.processRetiredFiles(); + assertFalse("file-A physical should be cleaned after processRetiredFiles", + fileStorage.exists(pathA)); + + // Step 1: Memory GC + Map bitmaps2 = new HashMap<>(); + Map fileStats2 = new HashMap<>(); + Map rgMap2 = getRgVisibilityMap(); + for (Map.Entry entry : rgMap2.entrySet()) + { + String rgKey = entry.getKey(); + long fid = RetinaUtils.parseFileIdFromRgKey(rgKey); + long[] bitmap = entry.getValue().garbageCollect(safeGcTs2); + bitmaps2.put(rgKey, bitmap); + + long recordNum = entry.getValue().getRecordNum(); + long invalidCount = 0; + for (long word : bitmap) + { + invalidCount += Long.bitCount(word); + } + final long ic = invalidCount; + final long rn = recordNum; + fileStats2.compute(fid, (k, existing) -> { + if (existing == null) return new long[]{rn, ic}; + existing[0] += rn; + existing[1] += ic; + return existing; + }); + } + + // file-A' has 6/10 deleted (60% > 30% threshold) → eligible + long[] statsAprime = fileStats2.get(fileIdAprime); + assertNotNull("file-A' stats must exist", statsAprime); + assertTrue("file-A' invalidRatio > threshold for round 2", + (double) statsAprime[1] / statsAprime[0] > 0.3); + + // Step 2: rewrite file-A' → file-A'' + StorageGarbageCollector.FileGroup groupAprime = + makeGroup(fileIdAprime, pathAprime, schema); + StorageGarbageCollector.RewriteResult resultR2 = + gcR2.rewriteFileGroup(groupAprime, safeGcTs2, bitmaps2); + long fileIdAdoubleprime = resultR2.newFileId; + assertTrue("GC round 2 must create new file", fileIdAdoubleprime > 0); + + // Verify survivors in file-A'': original odd rows minus those deleted in epoch 2 + // Deleted in file-A': new rows 0,1,2,4,6,8 → survivors: new rows 3,5,7,9 + // new row 3 = old row 7 (id=70), new row 5 = old row 11 (id=110), + // new row 7 = old row 15 (id=150), new row 9 = old row 19 (id=190) + long[][] rowsAdp = readAllRows(resultR2.newFilePath, schema, true); + assertEquals("4 survivors after round 2", 4, rowsAdp.length); + long[] 
expectedIdsR2 = {70L, 110L, 150L, 190L}; + for (int i = 0; i < 4; i++) + { + assertEquals("round 2: id mismatch at row " + i, + expectedIdsR2[i], rowsAdp[i][0]); + assertEquals("round 2: create_ts preserved", 5L, rowsAdp[i][1]); + } + + gcR2.registerDualWrite(resultR2); + gcR2.syncVisibility(resultR2, safeGcTs2); + gcR2.syncIndex(resultR2, groupAprime.tableId); + gcR2.commitFileGroup(resultR2); + + assertFileRegular(fileIdAdoubleprime, "file-A'' must be REGULAR after commit"); + + String pathAdoubleprime = resultR2.newFilePath; + + // ── CDC epoch 3 (ts 350~400) ──────────────────────────────────────── + // DELETE 3 rows in file-B (rows 0,1,2) at ts=350..370 + for (int i = 0; i < 3; i++) + { + retinaManager.deleteRecord(fileIdB, 0, i, 350L + i * 10L); + } + + // DELETE 2 rows in file-A'' (new rows 0,1) at ts=380,390 + retinaManager.deleteRecord(fileIdAdoubleprime, 0, 0, 380L); + retinaManager.deleteRecord(fileIdAdoubleprime, 0, 1, 390L); + + // ── GC round 3 (safeGcTs=450) ────────────────────────────────────── + long safeGcTs3 = 450L; + NoIndexSyncGC gcR3 = new NoIndexSyncGC(retinaManager, metadataService, + 0.3, 134_217_728L, Integer.MAX_VALUE, 10, 1048576, + EncodingLevel.EL2, 0L); + + // processRetiredFiles → file-A' should be cleaned + retinaManager.processRetiredFiles(); + assertFalse("file-A' physical should be cleaned after round 3 processRetiredFiles", + fileStorage.exists(pathAprime)); + + // Step 1: Memory GC + Map bitmaps3 = new HashMap<>(); + Map fileStats3 = new HashMap<>(); + Map rgMap3 = getRgVisibilityMap(); + for (Map.Entry entry : rgMap3.entrySet()) + { + String rgKey = entry.getKey(); + long fid = RetinaUtils.parseFileIdFromRgKey(rgKey); + long[] bitmap = entry.getValue().garbageCollect(safeGcTs3); + bitmaps3.put(rgKey, bitmap); + + long recordNum = entry.getValue().getRecordNum(); + long invalidCount = 0; + for (long word : bitmap) + { + invalidCount += Long.bitCount(word); + } + final long ic = invalidCount; + final long rn = recordNum; + 
fileStats3.compute(fid, (k, existing) -> { + if (existing == null) return new long[]{rn, ic}; + existing[0] += rn; + existing[1] += ic; + return existing; + }); + } + + // file-B: 3/10 deleted = 30%. With threshold 0.3, need > 30% so NOT eligible. + // file-A'': 2/4 deleted = 50% > 30% → eligible + long[] statsB = fileStats3.get(fileIdB); + assertNotNull("file-B stats must exist", statsB); + + long[] statsAdp = fileStats3.get(fileIdAdoubleprime); + assertNotNull("file-A'' stats must exist", statsAdp); + assertTrue("file-A'' invalidRatio > threshold for round 3", + (double) statsAdp[1] / statsAdp[0] > 0.3); + + // Rewrite file-A'' → file-A''' + StorageGarbageCollector.FileGroup groupAdp = + makeGroup(fileIdAdoubleprime, pathAdoubleprime, schema); + StorageGarbageCollector.RewriteResult resultR3 = + gcR3.rewriteFileGroup(groupAdp, safeGcTs3, bitmaps3); + long fileIdAtriple = resultR3.newFileId; + assertTrue("GC round 3 must create new file", fileIdAtriple > 0); + + // file-A''' survivors: original file-A'' rows 2,3 → ids 150, 190 + long[][] rowsAtriple = readAllRows(resultR3.newFilePath, schema, true); + assertEquals("2 survivors after round 3", 2, rowsAtriple.length); + assertEquals("round 3: id[0]", 150L, rowsAtriple[0][0]); + assertEquals("round 3: id[1]", 190L, rowsAtriple[1][0]); + assertEquals("round 3: create_ts[0]", 5L, rowsAtriple[0][1]); + assertEquals("round 3: create_ts[1]", 5L, rowsAtriple[1][1]); + + gcR3.registerDualWrite(resultR3); + gcR3.syncVisibility(resultR3, safeGcTs3); + gcR3.syncIndex(resultR3, groupAdp.tableId); + gcR3.commitFileGroup(resultR3); + + // ── Final verification ────────────────────────────────────────────── + + // Catalog: latest generation files are REGULAR + assertFileRegular(fileIdAtriple, "file-A''' must be REGULAR"); + assertNotNull("file-B must still exist (not GCed)", metadataService.getFileById(fileIdB)); + assertNotNull("file-C must still exist", metadataService.getFileById(fileIdC)); + + // Old generations gone from 
catalog + assertFileGone(fileIdA, "file-A should be gone from catalog"); + assertFileGone(fileIdAprime, "file-A' should be gone from catalog"); + assertFileGone(fileIdAdoubleprime, "file-A'' should be gone from catalog"); + + // Physical files from generations 1 and 2 cleaned up + assertFalse("file-A physical should not exist", fileStorage.exists(pathA)); + assertFalse("file-A' physical should not exist", fileStorage.exists(pathAprime)); + + // file-B visibility: rows 0,1,2 deleted at ts 350,360,370 + // After garbageCollect(safeGcTs3=450), baseTimestamp is 450 so only ts >= 450 can be queried. + // At ts=450 all three deletes (350,360,370) are baked into the base bitmap. + for (long snapTs : new long[]{450L, 500L}) + { + long[] bmB = retinaManager.queryVisibility(fileIdB, 0, snapTs, 0L); + for (int r = 0; r < 3; r++) + { + assertTrue("file-B snap_ts=" + snapTs + " row " + r + " should be deleted", + isBitSet(bmB, r)); + } + for (int r = 3; r < numRowsB; r++) + { + assertFalse("file-B snap_ts=" + snapTs + " row " + r + " should not be deleted", + isBitSet(bmB, r)); + } + } + + // file-C: no deletions at any snap_ts (only ts >= safeGcTs3 queryable) + for (long snapTs : new long[]{450L, 500L}) + { + long[] bmC = retinaManager.queryVisibility(fileIdC, 0, snapTs, 0L); + assertEquals("file-C should have no deletions at snap_ts=" + snapTs, + 0L, bmC[0]); + } + + // file-A''' (latest generation): verify visibility matches expectations + // It has 2 rows (originally from file-A, ids 150 and 190). + // No deletions were applied after safeGcTs3 to this file. 
+ for (long snapTs : new long[]{450L, 500L, 1000L}) + { + long[] bmAtriple = retinaManager.queryVisibility(fileIdAtriple, 0, snapTs, 0L); + for (int r = 0; r < 2; r++) + { + assertFalse("file-A''' snap_ts=" + snapTs + " row " + r + " should not be deleted", + isBitSet(bmAtriple, r)); + } + } + } + + // ======================================================================= + // Helpers: state management + // ======================================================================= + + /** Clears RetinaResourceManager internal maps and resets latestGcTimestamp. */ + private void resetManagerState() + { + try + { + Field rgMapField = RetinaResourceManager.class.getDeclaredField("rgVisibilityMap"); + rgMapField.setAccessible(true); + ((Map) rgMapField.get(retinaManager)).clear(); + + Field gcTsField = RetinaResourceManager.class.getDeclaredField("latestGcTimestamp"); + gcTsField.setAccessible(true); + gcTsField.setLong(retinaManager, -1L); + + Field dwField = RetinaResourceManager.class.getDeclaredField("dualWriteLookup"); + dwField.setAccessible(true); + ((Map) dwField.get(retinaManager)).clear(); + + Field dualWriteField = RetinaResourceManager.class.getDeclaredField("isDualWriteActive"); + dualWriteField.setAccessible(true); + dualWriteField.setBoolean(retinaManager, false); + + Field retiredField = RetinaResourceManager.class.getDeclaredField("retiredFiles"); + retiredField.setAccessible(true); + ((java.util.Queue) retiredField.get(retinaManager)).clear(); + } + catch (Exception e) + { + throw new RuntimeException("Failed to reset RetinaResourceManager state", e); + } + } + + /** Returns the private {@code rgVisibilityMap} from {@link RetinaResourceManager} via reflection. 
*/ + @SuppressWarnings("unchecked") + private Map getRgVisibilityMap() + { + try + { + Field f = RetinaResourceManager.class.getDeclaredField("rgVisibilityMap"); + f.setAccessible(true); + return (Map) f.get(retinaManager); + } + catch (Exception e) + { + throw new RuntimeException("Failed to access rgVisibilityMap", e); + } + } + + // ======================================================================= + // Helpers: catalog registration + // ======================================================================= + + private long registerTestFile(String name, File.Type type, + int numRg, long minRow, long maxRow) + throws Exception + { + File f = new File(); + f.setName(name); + f.setType(type); + f.setNumRowGroup(numRg); + f.setMinRowId(minRow); + f.setMaxRowId(maxRow); + f.setPathId(testPathId); + metadataService.addFiles(Collections.singletonList(f)); + long id = metadataService.getFileId(testOrderedPathUri + "/" + name); + assertTrue(name + " must have valid id", id > 0); + return id; + } + + private long[] registerTestFiles(String[] names, File.Type[] types, + int[] numRgs, long[] minRows, long[] maxRows) + throws Exception + { + List files = new ArrayList<>(); + for (int i = 0; i < names.length; i++) + { + File f = new File(); + f.setName(names[i]); + f.setType(types[i]); + f.setNumRowGroup(numRgs[i]); + f.setMinRowId(minRows[i]); + f.setMaxRowId(maxRows[i]); + f.setPathId(testPathId); + files.add(f); + } + metadataService.addFiles(files); + long[] ids = new long[names.length]; + for (int i = 0; i < names.length; i++) + { + ids[i] = metadataService.getFileId(testOrderedPathUri + "/" + names[i]); + assertTrue(names[i] + " must have valid id", ids[i] > 0); + } + return ids; + } + + // ======================================================================= + // Helpers: bitmap and catalog assertions + // ======================================================================= + + private static boolean isBitSet(long[] bitmap, int row) + { + return (bitmap[row 
/ 64] & (1L << (row % 64))) != 0; + } + + private void assertFileGone(long fileId, String msg) throws Exception + { + File f = metadataService.getFileById(fileId); + assertTrue(msg, f == null || f.getId() == 0); + } + + private void assertFileRegular(long fileId, String msg) throws Exception + { + File f = metadataService.getFileById(fileId); + assertNotNull(msg, f); + assertEquals(msg, File.Type.REGULAR, f.getType()); + } + + // ======================================================================= + // Helpers: GC factory for grouping tests + // ======================================================================= + + private static StorageGarbageCollector newGcForGrouping( + long targetFileSize, int maxFilesPerGroup, int maxGroups) + { + return new StorageGarbageCollector( + null, null, 0.5, targetFileSize, maxFilesPerGroup, maxGroups, + 1048576, EncodingLevel.EL2, 86_400_000L); + } + + // ======================================================================= + // Helpers: RewriteResult validation + // ======================================================================= + + /** + * Verifies the structural consistency of a {@link StorageGarbageCollector.RewriteResult}: + *
      + *
    • {@code newFileRgActualRecordNums} has exactly {@code newFileRgCount} entries, all positive
    • + *
    • {@code newFileRgRowStart} has {@code newFileRgCount + 1} entries forming a cumulative sum
    • + *
    • The sentinel entry equals the expected total surviving rows
    • + *
    + */ + private static void assertRewriteResultConsistency( + StorageGarbageCollector.RewriteResult result, int expectedTotalRows) + { + assertTrue("newFileRgCount must be at least 1", result.newFileRgCount >= 1); + assertEquals(result.newFileRgCount, result.newFileRgActualRecordNums.length); + assertEquals(result.newFileRgCount + 1, result.newFileRgRowStart.length); + int totalRecords = 0; + for (int i = 0; i < result.newFileRgCount; i++) + { + assertTrue("RG record count must be positive", result.newFileRgActualRecordNums[i] > 0); + assertEquals("rgRowStart must be cumulative sum", totalRecords, result.newFileRgRowStart[i]); + totalRecords += result.newFileRgActualRecordNums[i]; + } + assertEquals("sentinel rgRowStart must equal total surviving rows", + expectedTotalRows, totalRecords); + assertEquals(totalRecords, result.newFileRgRowStart[result.newFileRgCount]); + } + + // ======================================================================= + // Helpers: domain object builders + // ======================================================================= + + /** Creates a minimal {@link File} domain object with given id and rgCount. */ + private static File makeFile(long fileId, int rgCount) + { + File f = new File(); + f.setId(fileId); + f.setNumRowGroup(rgCount); + return f; + } + + /** + * Creates a {@code long[]} GC snapshot bitmap for one RG where exactly {@code deletedRows} + * out of {@code totalRows} rows are marked as deleted (rows 0 .. deletedRows-1 are set). + */ + private static long[] makeBitmap(int totalRows, int deletedRows) + { + int words = (totalRows + 63) / 64; + long[] bitmap = new long[words]; + for (int r = 0; r < deletedRows; r++) + { + bitmap[r / 64] |= (1L << (r % 64)); + } + return bitmap; + } + + /** + * Creates a GC snapshot bitmap of the minimum required word length for + * {@code totalRows} rows, with each index in {@code deletedRows} set. + */ + private static long[] makeBitmapForRows(int totalRows, int... 
deletedRows) + { + int words = (totalRows + 63) / 64; + long[] bitmap = new long[words]; + for (int r : deletedRows) + { + bitmap[r / 64] |= (1L << (r % 64)); + } + return bitmap; + } + + /** + * Creates a per-RG stats entry {@code {recordNum, invalidCount}} for one RG + * where exactly {@code deletedRows} out of {@code totalRows} rows are deleted. + * This mirrors what {@code runGC()} pre-computes during the Memory GC pass. + */ + private static long[] makeRgStats(int totalRows, int deletedRows) + { + return new long[]{totalRows, deletedRows}; + } + + // ======================================================================= + // Helpers: Pixels file I/O (for rewrite tests) + // ======================================================================= + + /** + * Writes a single-row-group Pixels file to the shared temp directory and returns + * its {@code file://} URI. The schema must consist entirely of {@code LONG} columns. + * + *

    When {@code hasHidden=true}, an extra {@link LongColumnVector} is appended to + * {@code batch.cols} before writing so that {@link PixelsWriterImpl} stores the + * hidden {@code create_ts} column; {@code createTs[i]} is written for row {@code i}. + */ + private static String writeTestFile(String fileName, TypeDescription schema, + long[] ids, boolean hasHidden, long[] createTs) + throws Exception + { + int n = ids.length; + int nUserCols = schema.getChildren().size(); + String path = testOrderedPathUri + "/" + fileName; + + VectorizedRowBatch batch = schema.createRowBatch(n); + if (hasHidden) + { + batch.cols = Arrays.copyOf(batch.cols, nUserCols + 1); + batch.cols[nUserCols] = new LongColumnVector(n); + } + for (int r = 0; r < n; r++) + { + ((LongColumnVector) batch.cols[0]).vector[r] = ids[r]; + if (hasHidden) + { + ((LongColumnVector) batch.cols[nUserCols]).vector[r] = createTs[r]; + } + } + batch.size = n; + + try (PixelsWriter writer = PixelsWriterImpl.newBuilder() + .setSchema(schema) + .setPixelStride(10_000) + .setRowGroupSize(256 * 1024 * 1024) + .setStorage(fileStorage) + .setPath(path) + .setOverwrite(true) + .setEncodingLevel(EncodingLevel.EL0) + .setCompressionBlockSize(1) + .setHasHiddenColumn(hasHidden) + .build()) + { + writer.addRowBatch(batch); + } + return path; + } + + /** + * Writes a multi-row-group Pixels file to the shared temp directory and returns + * its {@code file://} URI. Each of the {@code numRgs} row groups contains exactly + * {@code rowsPerRg} rows, written as separate {@code addRowBatch} calls so that + * the Pixels writer flushes one RG per call. Row values are sequential integers + * starting from 0. 
+ */ + private static String writeTestFileMultiRg(String fileName, TypeDescription schema, + int numRgs, int rowsPerRg) throws Exception + { + return writeTestFileMultiRg(fileName, schema, numRgs, rowsPerRg, 10_000); + } + + private static String writeTestFileMultiRg(String fileName, TypeDescription schema, + int numRgs, int rowsPerRg, + int pixelStride) throws Exception + { + String path = testOrderedPathUri + "/" + fileName; + try (PixelsWriter writer = PixelsWriterImpl.newBuilder() + .setSchema(schema) + .setPixelStride(pixelStride) + .setRowGroupSize(rowsPerRg) + .setStorage(fileStorage) + .setPath(path) + .setOverwrite(true) + .setEncodingLevel(EncodingLevel.EL0) + .setCompressionBlockSize(1) + .setHasHiddenColumn(false) + .build()) + { + for (int rg = 0; rg < numRgs; rg++) + { + VectorizedRowBatch batch = schema.createRowBatch(rowsPerRg); + for (int r = 0; r < rowsPerRg; r++) + { + ((LongColumnVector) batch.cols[0]).vector[r] = (long) (rg * rowsPerRg + r); + } + batch.size = rowsPerRg; + writer.addRowBatch(batch); + } + } + return path; + } + + /** + * Reads all data rows from a Pixels file and returns them as a {@code long[][]}. + * Each inner array has {@code nUserCols} entries; when {@code exposeHidden=true} + * an extra entry is appended containing the hidden {@code create_ts} value. 
+ */ + private static long[][] readAllRows(String path, TypeDescription schema, + boolean exposeHidden) throws Exception + { + List rows = new ArrayList<>(); + int nUserCols = schema.getChildren().size(); + + try (PixelsReader reader = PixelsReaderImpl.newBuilder() + .setStorage(fileStorage).setPath(path) + .setPixelsFooterCache(new PixelsFooterCache()).build()) + { + if (reader.getRowGroupNum() == 0) + { + return new long[0][]; + } + PixelsReaderOption opt = new PixelsReaderOption(); + opt.includeCols(schema.getFieldNames().toArray(new String[0])); + opt.exposeHiddenColumn(exposeHidden); + + try (PixelsRecordReader rr = reader.read(opt)) + { + VectorizedRowBatch batch; + while ((batch = rr.readBatch()) != null && batch.size > 0) + { + for (int r = 0; r < batch.size; r++) + { + int width = exposeHidden ? nUserCols + 1 : nUserCols; + long[] row = new long[width]; + for (int c = 0; c < nUserCols; c++) + { + row[c] = ((LongColumnVector) batch.cols[c]).vector[r]; + } + if (exposeHidden && batch.getHiddenColumnVector() != null) + { + row[nUserCols] = batch.getHiddenColumnVector().vector[r]; + } + rows.add(row); + } + } + } + } + return rows.toArray(new long[0][]); + } + + /** + * Builds a {@link StorageGarbageCollector.FileGroup} with a single + * {@link StorageGarbageCollector.FileCandidate} backed by the given file path. + * The row-group count is read from the file footer so the candidate reflects reality. + * {@code pathId} is set to {@link #testPathId} so that {@code addFiles} satisfies + * the foreign key constraint against the PATHS table. 
+ */ + private static StorageGarbageCollector.FileGroup makeGroup( + long fileId, String filePath, TypeDescription schema) throws Exception + { + int rgCount; + try (PixelsReader r = PixelsReaderImpl.newBuilder() + .setStorage(fileStorage).setPath(filePath) + .setPixelsFooterCache(new PixelsFooterCache()).build()) + { + rgCount = r.getRowGroupNum(); + } + + File f = new File(); + f.setId(fileId); + f.setNumRowGroup(rgCount); + f.setMinRowId(0L); + f.setMaxRowId(999L); + f.setPathId(testPathId); + + StorageGarbageCollector.FileCandidate fc = + new StorageGarbageCollector.FileCandidate( + f, filePath, fileId, rgCount, 1L, 0, 0.70, 0L); + + return new StorageGarbageCollector.FileGroup(1L, 0, Collections.singletonList(fc)); + } + + /** + * Builds a {@link StorageGarbageCollector.FileGroup} containing two + * {@link StorageGarbageCollector.FileCandidate} objects backed by distinct files. + * Both files share the same {@code (tableId=1, virtualNodeId=0)}. + */ + private static StorageGarbageCollector.FileGroup makeMultiFileGroup( + TypeDescription schema, + long fileIdA, String pathA, + long fileIdB, String pathB) throws Exception + { + List candidates = new ArrayList<>(); + for (long[] pair : new long[][]{{fileIdA, 0}, {fileIdB, 0}}) + { + long fid = pair[0]; + String path = (fid == fileIdA) ? pathA : pathB; + int rgCount; + try (PixelsReader r = PixelsReaderImpl.newBuilder() + .setStorage(fileStorage).setPath(path) + .setPixelsFooterCache(new PixelsFooterCache()).build()) + { + rgCount = r.getRowGroupNum(); + } + File f = new File(); + f.setId(fid); + f.setNumRowGroup(rgCount); + f.setMinRowId(fid * 1000); + f.setMaxRowId(fid * 1000 + 999); + f.setPathId(testPathId); + candidates.add(new StorageGarbageCollector.FileCandidate( + f, path, fid, rgCount, 1L, 0, 0.70, 0L)); + } + return new StorageGarbageCollector.FileGroup(1L, 0, candidates); + } + + /** + * Writes a multi-row-group Pixels file. 
Each element of {@code idsPerRg} + * becomes a separate row group (achieved via a tiny {@code rowGroupSize=1} + * that forces an RG flush after every batch). + * + * @param idsPerRg array of id-arrays, one per desired row group + * @param hasHidden whether to write the hidden create_ts column + * @param tsPerRg create_ts values per RG (same shape as idsPerRg), or null + */ + private static String writeMultiRgTestFile(String fileName, TypeDescription schema, + long[][] idsPerRg, boolean hasHidden, + long[][] tsPerRg) throws Exception + { + int nUserCols = schema.getChildren().size(); + String path = testOrderedPathUri + "/" + fileName; + + try (PixelsWriter writer = PixelsWriterImpl.newBuilder() + .setSchema(schema) + .setPixelStride(1) + .setRowGroupSize(1) + .setStorage(fileStorage) + .setPath(path) + .setOverwrite(true) + .setEncodingLevel(EncodingLevel.EL0) + .setCompressionBlockSize(1) + .setHasHiddenColumn(hasHidden) + .build()) + { + for (int rg = 0; rg < idsPerRg.length; rg++) + { + long[] ids = idsPerRg[rg]; + int n = ids.length; + VectorizedRowBatch batch = schema.createRowBatch(n); + if (hasHidden) + { + batch.cols = Arrays.copyOf(batch.cols, nUserCols + 1); + batch.cols[nUserCols] = new LongColumnVector(n); + } + for (int r = 0; r < n; r++) + { + ((LongColumnVector) batch.cols[0]).vector[r] = ids[r]; + if (hasHidden && tsPerRg != null) + { + ((LongColumnVector) batch.cols[nUserCols]).vector[r] = tsPerRg[rg][r]; + } + } + batch.size = n; + writer.addRowBatch(batch); + } + } + return path; + } + + // ======================================================================= + // Helpers: file cleanup + // ======================================================================= + + private static void deleteRecursive(java.io.File f) + { + if (f.isDirectory()) + { + java.io.File[] children = f.listFiles(); + if (children != null) + { + for (java.io.File child : children) + { + deleteRecursive(child); + } + } + } + f.delete(); + } + + // 
======================================================================= + // S1: PxlFileType filter test + // ======================================================================= + + /** + * Verifies that {@code isGcEligible} correctly accepts ORDERED/COMPACT files + * and rejects SINGLE/COPY files as well as unrecognised paths. + */ + @Test + public void testS1_fileTypeFilter_singleAndCopyExcluded() + { + assertTrue("ordered file should be GC eligible", + PixelsFileNameUtils.isGcEligible("host1_20260401120000_1_0_ordered.pxl")); + assertTrue("compact file should be GC eligible", + PixelsFileNameUtils.isGcEligible("host1_20260401120000_2_0_compact.pxl")); + assertFalse("single file should NOT be GC eligible", + PixelsFileNameUtils.isGcEligible("host1_20260401120000_3_-1_single.pxl")); + assertFalse("copy file should NOT be GC eligible", + PixelsFileNameUtils.isGcEligible("host1_20260401120000_4_0_copy.pxl")); + assertFalse("unrecognised path should NOT be GC eligible", + PixelsFileNameUtils.isGcEligible("random_file.parquet")); + assertFalse("null path should NOT be GC eligible", + PixelsFileNameUtils.isGcEligible(null)); + assertFalse("empty path should NOT be GC eligible", + PixelsFileNameUtils.isGcEligible("")); + + assertEquals(PixelsFileNameUtils.PxlFileType.ORDERED, + PixelsFileNameUtils.extractFileType("host1_20260401120000_1_0_ordered.pxl")); + assertEquals(PixelsFileNameUtils.PxlFileType.COMPACT, + PixelsFileNameUtils.extractFileType("/some/dir/host1_20260401120000_2_5_compact.pxl")); + assertEquals(PixelsFileNameUtils.PxlFileType.SINGLE, + PixelsFileNameUtils.extractFileType("host1_20260401120000_3_-1_single.pxl")); + assertEquals(PixelsFileNameUtils.PxlFileType.COPY, + PixelsFileNameUtils.extractFileType("host1_20260401120000_4_0_copy.pxl")); + } + + // ======================================================================= + // S2: Multi-column schema rewrite test + // ======================================================================= + + /** + 
* Rewrites a file with a multi-column schema (LONG + STRING + DOUBLE) + * and verifies all column values survive the rewrite correctly. + */ + @Test + public void testS2_multiColumnSchema_longStringDouble() throws Exception + { + TypeDescription schema = TypeDescription.fromString("struct"); + int numRows = 6; + long fileId = 5001L; + + String path = testOrderedPathUri + "/src_multi_col.pxl"; + VectorizedRowBatch batch = schema.createRowBatch(numRows); + long[] ids = {10L, 20L, 30L, 40L, 50L, 60L}; + String[] names = {"alice", "bob", "carol", "dave", "eve", "frank"}; + double[] scores = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6}; + + for (int r = 0; r < numRows; r++) + { + ((LongColumnVector) batch.cols[0]).vector[r] = ids[r]; + byte[] nameBytes = names[r].getBytes(java.nio.charset.StandardCharsets.UTF_8); + ((BinaryColumnVector) batch.cols[1]).setVal(r, nameBytes); + ((DoubleColumnVector) batch.cols[2]).vector[r] = Double.doubleToLongBits(scores[r]); + } + batch.size = numRows; + + try (PixelsWriter writer = PixelsWriterImpl.newBuilder() + .setSchema(schema) + .setPixelStride(10_000) + .setRowGroupSize(256 * 1024 * 1024) + .setStorage(fileStorage) + .setPath(path) + .setOverwrite(true) + .setEncodingLevel(EncodingLevel.EL0) + .setCompressionBlockSize(1) + .setHasHiddenColumn(false) + .build()) + { + writer.addRowBatch(batch); + } + + Map bitmaps = new HashMap<>(); + bitmaps.put(fileId + "_0", makeBitmapForRows(numRows, 1, 3, 5)); + + StorageGarbageCollector.FileGroup group = makeGroup(fileId, path, schema); + StorageGarbageCollector.RewriteResult result = + gc.rewriteFileGroup(group, 100L, bitmaps); + + try (PixelsReader reader = PixelsReaderImpl.newBuilder() + .setStorage(fileStorage).setPath(result.newFilePath) + .setPixelsFooterCache(new PixelsFooterCache()).build()) + { + String[] colNames = {"id", "name", "score"}; + PixelsReaderOption option = new PixelsReaderOption(); + option.skipCorruptRecords(true); + option.tolerantSchemaEvolution(true); + 
option.includeCols(colNames); + + PixelsRecordReader rr = reader.read(option); + VectorizedRowBatch outBatch = rr.readBatch(); + assertEquals("3 rows should survive", 3, outBatch.size); + + long[] expectedIds = {10L, 30L, 50L}; + String[] expectedNames = {"alice", "carol", "eve"}; + double[] expectedScores = {1.1, 3.3, 5.5}; + + for (int r = 0; r < 3; r++) + { + assertEquals("id mismatch at row " + r, + expectedIds[r], ((LongColumnVector) outBatch.cols[0]).vector[r]); + String actualName = new String( + ((BinaryColumnVector) outBatch.cols[1]).vector[r], + ((BinaryColumnVector) outBatch.cols[1]).start[r], + ((BinaryColumnVector) outBatch.cols[1]).lens[r], + java.nio.charset.StandardCharsets.UTF_8); + assertEquals("name mismatch at row " + r, + expectedNames[r], actualName); + assertEquals("score mismatch at row " + r, + expectedScores[r], + Double.longBitsToDouble(((DoubleColumnVector) outBatch.cols[2]).vector[r]), + 1e-9); + } + } + + assertRewriteResultConsistency(result, 3); + } + + // ======================================================================= + // S2: Large-scale rewrite performance benchmark + // ======================================================================= + + /** + * Rewrites a file with 2000 rows (multi-RG) deleting ~50%, verifying + * correctness and serving as a performance baseline. 
+ */ + @Test + public void testS2_largeScaleRewrite_2000rows() throws Exception + { + TypeDescription schema = LONG_ID_SCHEMA; + int totalRows = 2000; + int rowsPerRg = 500; + int numRgs = totalRows / rowsPerRg; + long fileId = 6001L; + + String srcPath = writeTestFileMultiRg("src_large.pxl", schema, numRgs, rowsPerRg, rowsPerRg); + + Map bitmaps = new HashMap<>(); + int expectedSurvivors = 0; + for (int rg = 0; rg < numRgs; rg++) + { + List deleted = new ArrayList<>(); + for (int r = 0; r < rowsPerRg; r++) + { + if (r % 2 == 0) + { + deleted.add(r); + } + } + bitmaps.put(fileId + "_" + rg, + makeBitmapForRows(rowsPerRg, deleted.stream().mapToInt(Integer::intValue).toArray())); + expectedSurvivors += (rowsPerRg - deleted.size()); + } + + StorageGarbageCollector.FileGroup group = makeGroup(fileId, srcPath, schema); + + long startNs = System.nanoTime(); + StorageGarbageCollector.RewriteResult result = + gc.rewriteFileGroup(group, 100L, bitmaps); + long elapsedMs = (System.nanoTime() - startNs) / 1_000_000; + + long[][] rows = readAllRows(result.newFilePath, schema, false); + assertEquals("surviving rows after 50% deletion", expectedSurvivors, rows.length); + + for (int i = 0; i < rows.length; i++) + { + long expectedId = (i / (rowsPerRg / 2)) * rowsPerRg + (i % (rowsPerRg / 2)) * 2 + 1; + assertEquals("id mismatch at survivor row " + i, expectedId, rows[i][0]); + } + + assertRewriteResultConsistency(result, expectedSurvivors); + + assertTrue("Large-scale rewrite should complete in under 30s, took " + elapsedMs + "ms", + elapsedMs < 30_000); + } + + // ======================================================================= + // Memory GC: gcSnapshotBitmap correctness + // ======================================================================= + + /** + * Verifies that {@code RGVisibility.garbageCollect(safeGcTs)} produces the + * correct gcSnapshotBitmap for three distinct scenarios: + *

+ * <ol>
+ * <li>No deletions at all → all-zero bitmap</li>
+ * <li>All deletions before safeGcTs → bitmap reflects all deleted rows</li>
+ * <li>Mixed: some before, some after safeGcTs → bitmap reflects only the before-set</li>
+ * </ol>
    + */ + @Test + public void testGcSnapshotBitmap_threePathCorrectness() throws Exception + { + int recordNum = 10; + + // Path 1: No deletions → all-zero bitmap + long fid1 = 7001L; + retinaManager.addVisibility(fid1, 0, recordNum, 0L, null, false); + Map rgMap = getRgVisibilityMap(); + RGVisibility rv1 = rgMap.get(RetinaUtils.buildRgKey(fid1, 0)); + assertNotNull("RGVisibility for fid1 must exist", rv1); + long[] bm1 = rv1.garbageCollect(100L); + for (long word : bm1) + { + assertEquals("Path 1: no deletions → bitmap must be zero", 0L, word); + } + + // Path 2: All deletions before safeGcTs → all marked + long fid2 = 7002L; + retinaManager.addVisibility(fid2, 0, recordNum, 0L, null, false); + for (int r = 0; r < recordNum; r++) + { + retinaManager.deleteRecord(fid2, 0, r, 50L); + } + RGVisibility rv2 = getRgVisibilityMap() + .get(RetinaUtils.buildRgKey(fid2, 0)); + long[] bm2 = rv2.garbageCollect(100L); + int deletedCount2 = 0; + for (long word : bm2) + { + deletedCount2 += Long.bitCount(word); + } + assertEquals("Path 2: all deleted before safeGcTs", recordNum, deletedCount2); + for (int r = 0; r < recordNum; r++) + { + assertTrue("Path 2: row " + r + " must be set", isBitSet(bm2, r)); + } + + // Path 3: Mixed — delete rows 0,2,4 at ts=50 and rows 1,3 at ts=150 + long fid3 = 7003L; + retinaManager.addVisibility(fid3, 0, recordNum, 0L, null, false); + for (int r : new int[]{0, 2, 4}) + { + retinaManager.deleteRecord(fid3, 0, r, 50L); + } + for (int r : new int[]{1, 3}) + { + retinaManager.deleteRecord(fid3, 0, r, 150L); + } + RGVisibility rv3 = getRgVisibilityMap() + .get(RetinaUtils.buildRgKey(fid3, 0)); + long[] bm3 = rv3.garbageCollect(100L); + + for (int r : new int[]{0, 2, 4}) + { + assertTrue("Path 3: row " + r + " (deleted at ts=50 < safeGcTs=100) must be set", + isBitSet(bm3, r)); + } + for (int r : new int[]{1, 3}) + { + assertFalse("Path 3: row " + r + " (deleted at ts=150 > safeGcTs=100) must NOT be set", + isBitSet(bm3, r)); + } + for (int r : new 
int[]{5, 6, 7, 8, 9}) + { + assertFalse("Path 3: row " + r + " (not deleted) must NOT be set", + isBitSet(bm3, r)); + } + } + + // ======================================================================= + // Inner classes: DirectScanStorageGC stub (for scan/grouping tests) + // ======================================================================= + + /** Represents a fake catalog file entry used by {@link DirectScanStorageGC}. */ + static class FakeFileEntry + { + final long fileId; + final int rgCount; + final long tableId; + final int virtualNodeId; + + FakeFileEntry(long fileId, int rgCount, long tableId, int virtualNodeId) + { + this.fileId = fileId; + this.rgCount = rgCount; + this.tableId = tableId; + this.virtualNodeId = virtualNodeId; + } + } + + /** + * StorageGarbageCollector subclass that replaces the metadata scan loop with + * a caller-supplied list of {@link FakeFileEntry} objects, while reusing the + * real invalidRatio computation and {@link StorageGarbageCollector#groupAndMerge} logic. 
+ */ + static class DirectScanStorageGC extends StorageGarbageCollector + { + private final List fakeEntries; + + DirectScanStorageGC(RetinaResourceManager rm, double threshold, + int maxGroups, List fakeEntries) + { + super(rm, null, threshold, 134_217_728L, Integer.MAX_VALUE, maxGroups, + 1048576, EncodingLevel.EL2, 86_400_000L); + this.fakeEntries = fakeEntries; + } + + @Override + List scanAndGroupFiles(Set candidateFileIds, + Map fileStats) + { + List candidates = new ArrayList<>(); + for (FakeFileEntry entry : fakeEntries) + { + if (!candidateFileIds.contains(entry.fileId)) + { + continue; + } + long[] stats = fileStats.get(entry.fileId); + if (stats == null || stats[0] == 0) + { + continue; + } + double ratio = (double) stats[1] / stats[0]; + candidates.add(new FileCandidate( + makeFile(entry.fileId, entry.rgCount), + "fake_" + entry.fileId + "_0_" + entry.virtualNodeId + "_ordered.pxl", + entry.fileId, entry.rgCount, + entry.tableId, entry.virtualNodeId, + ratio, 0L)); + } + return groupAndMerge(candidates); + } + + @Override + void processFileGroups(List fileGroups, long safeGcTs, + Map gcSnapshotBitmaps) + { + } + } + + /** + * StorageGarbageCollector subclass where {@code rewriteFileGroup} throws on + * the first call and succeeds (cleaning up bitmaps) on subsequent calls. + * Used by {@link #testProcessFileGroups_firstGroupFailsSecondContinues} to + * verify that {@code processFileGroups} catches per-group failures, cleans + * up bitmaps, and continues to the next group. 
+ */ + static class FailFirstGroupGC extends StorageGarbageCollector + { + private boolean firstCall = true; + + FailFirstGroupGC() + { + super(null, null, 0.5, 0L, Integer.MAX_VALUE, 10, + 1048576, EncodingLevel.EL2, 86_400_000L); + } + + @Override + RewriteResult rewriteFileGroup(FileGroup group, long safeGcTs, + Map gcSnapshotBitmaps) throws Exception + { + if (firstCall) + { + firstCall = false; + throw new RuntimeException("simulated rewrite failure"); + } + for (FileCandidate fc : group.files) + { + for (int rgId = 0; rgId < fc.rgCount; rgId++) + { + gcSnapshotBitmaps.remove(RetinaUtils.buildRgKey(fc.fileId, rgId)); + } + } + return new RewriteResult(group, "stub", -1, + 0, new int[0], new int[]{0}, new HashMap<>(), Collections.emptyList(), + Collections.emptyList()); + } + } + + /** + * StorageGarbageCollector subclass that stubs out {@code syncIndex} so that + * the full pipeline can be tested without requiring a configured MainIndex. + * All other methods (rewrite, dual-write, visibility sync, commit) use real code. 
+ */ + static class NoIndexSyncGC extends StorageGarbageCollector + { + NoIndexSyncGC(RetinaResourceManager rm, MetadataService ms, + double threshold, long targetFileSize, int maxFilesPerGroup, + int maxGroups, int rowGroupSize, EncodingLevel encodingLevel, + long retireDelayMs) + { + super(rm, ms, threshold, targetFileSize, maxFilesPerGroup, maxGroups, + rowGroupSize, encodingLevel, retireDelayMs); + } + + @Override + void syncIndex(RewriteResult result, long tableId) throws Exception + { + } + } +} diff --git a/proto/metadata.proto b/proto/metadata.proto index 61c75c0b52..575b868918 100644 --- a/proto/metadata.proto +++ b/proto/metadata.proto @@ -70,6 +70,8 @@ service MetadataService { rpc GetFileType (GetFileTypeRequest) returns (GetFileTypeResponse); rpc UpdateFile (UpdateFileRequest) returns (UpdateFileResponse); rpc DeleteFiles (DeleteFilesRequest) returns (DeleteFilesResponse); + rpc GetFileById (GetFileByIdRequest) returns (GetFileByIdResponse); + rpc AtomicSwapFiles (AtomicSwapFilesRequest) returns (AtomicSwapFilesResponse); rpc CreatePeerPath (CreatePeerPathRequest) returns (CreatePeerPathResponse); rpc GetPeerPaths (GetPeerPathsRequest) returns (GetPeerPathsResponse); rpc UpdatePeerPath (UpdatePeerPathRequest) returns (UpdatePeerPathResponse); @@ -725,6 +727,26 @@ message DeleteFilesResponse { ResponseHeader header = 1; } +message GetFileByIdRequest { + RequestHeader header = 1; + uint64 fileId = 2; +} + +message GetFileByIdResponse { + ResponseHeader header = 1; + File file = 2; +} + +message AtomicSwapFilesRequest { + RequestHeader header = 1; + uint64 newFileId = 2; + repeated uint64 oldFileIds = 3; +} + +message AtomicSwapFilesResponse { + ResponseHeader header = 1; +} + // peer path message CreatePeerPathRequest { RequestHeader header = 1;