datastax
diff --git a/benchmarks-jmh/src/main/java/io/github/jbellis/jvector/bench/CompactorBenchmark.java b/benchmarks-jmh/src/main/java/io/github/jbellis/jvector/bench/CompactorBenchmark.java
Lines changed: 68 additions & 67 deletions
diff --git a/benchmarks-jmh/src/main/java/io/github/jbellis/jvector/bench/CompactorBenchmark.md b/benchmarks-jmh/src/main/java/io/github/jbellis/jvector/bench/CompactorBenchmark.md
Lines changed: 35 additions & 10 deletions
@@ -107,26 +107,22 @@ public enum IndexPrecision {
 
     public enum WorkloadMode {
         /**
-         * Build per-source partitions and stop. (No compaction, no recall.)
+         * Build per-source partitions and stop.
          */
-        PARTITION_ONLY,
+        PARTITION,
 
         /**
          * Assume partitions exist on disk; compact them.
          */
-        COMPACT_ONLY,
+        COMPACT,
 
         /**
-         * Assume partitions exist on disk; compact them, then run recall.
+         * Build a single graph for the whole dataset and write it.
          */
-        COMPACT_AND_RECALL,
+        BUILD,
 
         /**
-         * Build a single graph for the whole dataset and write it. Then run recall.
-         */
-        BUILD_FROM_SCRATCH,
-        /**
-         * (Default) Build partitions, compact them, then run recall.
+         * (Default) Build partitions, compact them.
          */
         PARTITION_AND_COMPACT
     }
@@ -278,6 +274,9 @@ private static void writeCompletedCount(int count) {
     @Param({"PARTITION_AND_COMPACT"})
     public WorkloadMode workloadMode;
 
+    @Param({"true"})
+    public boolean measureRecall;
+
     @Param({"4"}) // Default value, can be overridden via command line
     public int numPartitions;
 
@@ -387,25 +386,10 @@ public void setup() throws Exception {
 
             int dimension;
 
-            if (workloadMode == WorkloadMode.COMPACT_ONLY) {
-                ds = null;
-                queryVectors = null;
-                groundTruth = null;
-                ravv = null;
-                baseVectors = null;
-                dimension = -1;
-
-                var datasetInfo = DataSets.loadDataSet(datasetNames);
-                similarityFunction = datasetInfo
-                        .flatMap(DataSetInfo::similarityFunction)
-                        .orElseGet(() -> {
-                            log.warn("Could not determine similarity function for dataset '{}'; defaulting to COSINE", datasetNames);
-                            return VectorSimilarityFunction.COSINE;
-                        });
+            boolean needsBaseVectors = workloadMode != WorkloadMode.COMPACT;
+            boolean needsRecallData = measureRecall && workloadMode != WorkloadMode.PARTITION;
 
-                log.info("Skipping dataset load for COMPACT_ONLY mode without recall. Workload: {}, similarityFunction: {}, Live nodes rate: {}",
-                        workloadMode, similarityFunction, liveNodesRate);
-            } else {
+            if (needsBaseVectors) {
                 ds = DataSets.loadDataSet(datasetNames)
                         .orElseThrow(() -> new RuntimeException("Dataset not found: " + datasetNames))
                         .getDataSet();
@@ -425,13 +409,38 @@ public void setup() throws Exception {
                     ravv = new ListRandomAccessVectorValues(baseVectors, ds.getDimension());
                 }
 
-                queryVectors = ds.getQueryVectors();
-                groundTruth = ds.getGroundTruth();
                 similarityFunction = ds.getSimilarityFunction();
                 dimension = ds.getDimension();
 
-                log.info("Dataset {} loaded with recall data. Base vectors: {} (portion {}), Query vectors: {}, Dim: {}, Similarity: {}, Workload: {}, Live nodes rate: {}",
-                        datasetNames, ravv.size(), datasetPortion, queryVectors.size(), dimension, similarityFunction, workloadMode, liveNodesRate);
+                if (needsRecallData) {
+                    queryVectors = ds.getQueryVectors();
+                    groundTruth = ds.getGroundTruth();
+                    log.info("Dataset {} loaded with recall data. Base vectors: {} (portion {}), Query vectors: {}, Dim: {}, Similarity: {}, Workload: {}, measureRecall: {}, Live nodes rate: {}",
+                            datasetNames, ravv.size(), datasetPortion, queryVectors.size(), dimension, similarityFunction, workloadMode, measureRecall, liveNodesRate);
+                } else {
+                    queryVectors = null;
+                    groundTruth = null;
+                    log.info("Dataset {} loaded (base vectors only). Base vectors: {} (portion {}), Dim: {}, Similarity: {}, Workload: {}, measureRecall: {}",
+                            datasetNames, ravv.size(), datasetPortion, dimension, similarityFunction, workloadMode, measureRecall);
+                }
+            } else {
+                ds = null;
+                queryVectors = null;
+                groundTruth = null;
+                ravv = null;
+                baseVectors = null;
+                dimension = -1;
+
+                var datasetInfo = DataSets.loadDataSet(datasetNames);
+                similarityFunction = datasetInfo
+                        .flatMap(DataSetInfo::similarityFunction)
+                        .orElseGet(() -> {
+                            log.warn("Could not determine similarity function for dataset '{}'; defaulting to COSINE", datasetNames);
+                            return VectorSimilarityFunction.COSINE;
+                        });
+
+                log.info("Skipping dataset load for {} mode. similarityFunction: {}, Live nodes rate: {}",
+                        workloadMode, similarityFunction, liveNodesRate);
             }
 
             // Resolve storagePaths + partitionsDir
@@ -440,22 +449,18 @@ public void setup() throws Exception {
             compactOutputPath = resolveCompactOutputPath(partitionsBaseDir);
             scratchOutputPath = resolveScratchOutputPath(partitionsBaseDir);
 
-            // Clean stale artifacts only if we're going to rebuild them.
-            if (workloadMode == WorkloadMode.COMPACT_ONLY || workloadMode == WorkloadMode.COMPACT_AND_RECALL) {
-                // For compact-only and compact-and-recall, ensure the partition files exist.
+            if (workloadMode == WorkloadMode.COMPACT) {
                 verifyPartitionsExist(partitionsBaseDir, numPartitions);
             }
 
-            // Partition metadata for remapping (needed for compaction)
-            if (workloadMode == WorkloadMode.PARTITION_ONLY || workloadMode == WorkloadMode.PARTITION_AND_COMPACT) {
+            if (workloadMode == WorkloadMode.PARTITION || workloadMode == WorkloadMode.PARTITION_AND_COMPACT) {
                 var partitionedData = DataSetPartitioner.partition(baseVectors, numPartitions, splitDistribution);
                 vectorsPerSourceCount = partitionedData.sizes;
             } else {
                 vectorsPerSourceCount = null;
             }
 
-            // Build partitions during setup for SEGMENTS_* (matches original benchmark structure)
-            if (workloadMode == WorkloadMode.PARTITION_ONLY || workloadMode == WorkloadMode.PARTITION_AND_COMPACT) {
+            if (workloadMode == WorkloadMode.PARTITION || workloadMode == WorkloadMode.PARTITION_AND_COMPACT) {
                 if (jfrPartitioning) {
                     jfrPartitioningRecorder.start(JFR_DIR, "partitioning-" + jfrParamSuffix() + ".jfr", jfrObjectCount);
                 }
@@ -472,8 +477,8 @@ public void setup() throws Exception {
     }
 
     private void validateParams() {
-        if (workloadMode == WorkloadMode.BUILD_FROM_SCRATCH) {
-            log.warn("numPartitions={} ignored in BUILD_FROM_SCRATCH mode", numPartitions);
+        if (workloadMode == WorkloadMode.BUILD) {
+            log.warn("numPartitions={} ignored in BUILD mode", numPartitions);
         }
         else {
            if (numPartitions <= 1) throw new IllegalArgumentException("numPartitions must be larger than one");
@@ -570,7 +575,7 @@ private void verifyPartitionsExist(Path partitionsDir, int numPartitions) {
         for (int i = 0; i < numPartitions; i++) {
             Path seg = partitionsDir.resolve("per-source-graph-" + i);
             if (!Files.exists(seg)) {
-                throw new IllegalStateException("Missing partition file for COMPACT_ONLY or COMPACT_AND_RECALL: " + seg.toAbsolutePath());
+                throw new IllegalStateException("Missing partition file for COMPACT mode: " + seg.toAbsolutePath());
             }
         }
     }
@@ -701,6 +706,14 @@ private long buildFromScratch(List<VectorFloat<?>> baseVectors) throws Exception
 
         int dimension = baseVectors.get(0).length();
         var full = new ListRandomAccessVectorValues(baseVectors, dimension);
+
+        log.info("Building from scratch: vectors={} dim={} sim={} deg={} bw={} precision={} pwThreads={} vp={} -> {}",
+                full.size(), dimension, similarityFunction,
+                graphDegree, beamWidth, indexPrecision, parallelWriteThreads, resolvedVectorizationProvider,
+                scratchOutputPath.toAbsolutePath());
+
+        long startNanos = System.nanoTime();
+
         ProductQuantization pq = null;
         PQVectors pqVectors = null;
         BuildScoreProvider bsp;
@@ -714,11 +727,6 @@ private long buildFromScratch(List<VectorFloat<?>> baseVectors) throws Exception
             bsp = BuildScoreProvider.randomAccessScoreProvider(full, similarityFunction);
         }
 
-        log.info("Building from scratch: vectors={} dim={} sim={} deg={} bw={} precision={} pwThreads={} vp={} -> {}",
-                full.size(), dimension, similarityFunction,
-                graphDegree, beamWidth, indexPrecision, parallelWriteThreads, resolvedVectorizationProvider,
-                scratchOutputPath.toAbsolutePath());
-
         var builder = new GraphIndexBuilder(bsp, dimension, graphDegree, beamWidth, 1.2f, 1.2f, true);
         var graph = builder.build(full);
 
@@ -730,19 +738,10 @@ private long buildFromScratch(List<VectorFloat<?>> baseVectors) throws Exception
 
         writerBuilder.with(new InlineVectors(dimension));
 
-//        ProductQuantization pq = null;
-//        PQVectors pqVectors = null;
-//        if (indexPrecision == IndexPrecision.FUSEDPQ) {
-//            boolean centerData = similarityFunction == VectorSimilarityFunction.EUCLIDEAN;
-//            pq = ProductQuantization.compute(full, dimension / 8, 256, centerData);
-//            pqVectors = (PQVectors) pq.encodeAll(full);
-//            writerBuilder.with(new FusedPQ(graph.maxDegree(), pq));
-//        }
         if (indexPrecision == IndexPrecision.FUSEDPQ) {
             writerBuilder.with(new FusedPQ(graph.maxDegree(), pq));
         }
 
-        long startNanos = System.nanoTime();
         try (var writer = writerBuilder.build()) {
             var suppliers = new EnumMap<FeatureId, IntFunction<Feature.State>>(FeatureId.class);
             suppliers.put(FeatureId.INLINE_VECTORS, ord -> new InlineVectors.State(full.getVector(ord)));
@@ -814,27 +813,28 @@ public void run(Blackhole blackhole, RecallResult recallResult) throws Exception
 
             // Execute workload
             switch (workloadMode) {
-                case PARTITION_ONLY:
+                case PARTITION:
                     break;
 
-                case COMPACT_ONLY:
+                case COMPACT:
                     durationMs = compactPartitions();
+                    if (measureRecall) {
+                        recall = runRecall(compactOutputPath);
+                    }
                     break;
 
-                case COMPACT_AND_RECALL:
-                    durationMs = compactPartitions();
-                    recall = runRecall(compactOutputPath);
-                    break;
-
-                case BUILD_FROM_SCRATCH: {
+                case BUILD:
                     durationMs = buildFromScratch(baseVectors);
-                    recall = runRecall(scratchOutputPath);
+                    if (measureRecall) {
+                        recall = runRecall(scratchOutputPath);
+                    }
                     break;
-                }
 
                 case PARTITION_AND_COMPACT:
                     durationMs = compactPartitions();
-                    recall = runRecall(compactOutputPath);
+                    if (measureRecall) {
+                        recall = runRecall(compactOutputPath);
+                    }
                     break;
 
                 default:
@@ -902,6 +902,7 @@ private LinkedHashMap<String, Object> buildParams() {
         params.put("parallelWriteThreads", parallelWriteThreads);
         params.put("vectorizationProvider", resolvedVectorizationProvider);
         params.put("datasetPortion", datasetPortion);
+        params.put("measureRecall", measureRecall);
         params.put("jfrPartitioning", jfrPartitioning);
         params.put("jfrCompacting", jfrCompacting);
         params.put("jfrObjectCount", jfrObjectCount);
 
@@ -24,10 +24,33 @@
 
 | Mode | Description |
 |------|-------------|
-| `PARTITION_AND_COMPACT` | **(default)** Build partitions, compact them, then measure recall — all in one run |
-| `PARTITION_ONLY` | Build N partition indexes and exit; no compaction |
-| `COMPACT_ONLY` | Compact existing partitions without loading the dataset |
-| `BUILD_FROM_SCRATCH` | Build a single index over the full dataset |
+| `PARTITION_AND_COMPACT` | **(default)** Build partitions, then compact them; recall is measured afterwards when `measureRecall=true` |
+| `PARTITION` | Build N partition indexes and exit; no compaction |
+| `COMPACT` | Compact existing partitions |
+| `BUILD` | Build a single index over the full dataset |
+
+## measureRecall
+
+Set `-p measureRecall=false` to skip recall measurement. For `COMPACT` mode this also
+skips dataset loading entirely, since query vectors and ground truth are not needed.
+
+## Fair comparison guidelines
+
+**Recall**: compare `PARTITION_AND_COMPACT` against `BUILD`, both with `measureRecall=true` and
+the same dataset, `indexPrecision`, `graphDegree`, and `beamWidth`. Both modes search
+using FusedPQ with full-precision (FP) reranking, so the `recall` field in the JSONL output
+of each run is directly comparable.
+
+**Build performance**: use `COMPACT` vs `BUILD`, both with `measureRecall=false`.
+The `durationMs` field measures only the graph construction pipeline (PQ training +
+graph build + write for `BUILD`; PQ retraining + neighbor gathering + write for
+`COMPACT`). Dataset loading is excluded from `durationMs` in both modes.
+Run `PARTITION` first to create the partition files needed by `COMPACT`.
+
+**Memory footprint**: run `COMPACT` with `measureRecall=false` and a small `-Xmx`
+(e.g., 5g). Since `COMPACT` does not load the dataset into heap, the heap limit
+reflects only the compactor's own memory usage. `BUILD` always requires the full
+dataset in memory, so its heap requirement scales with dataset size.
 
 ---
 
@@ -53,7 +76,7 @@ java -Xmx220g --add-modules jdk.incubator.vector \
 
 # 3. Measuring Peak Heap During Compaction
 
-The two-step workflow (`PARTITION_ONLY` → `COMPACT_ONLY`) exists to isolate compaction's true memory footprint. In `PARTITION_AND_COMPACT` mode the dataset is still resident in heap during compaction, which inflates the apparent memory cost. `COMPACT_ONLY` skips dataset loading entirely, so the heap limit applies only to the compactor itself.
+The two-step workflow (`PARTITION` → `COMPACT` with `measureRecall=false`) exists to isolate compaction's true memory footprint. In `PARTITION_AND_COMPACT` mode the dataset is still resident in heap during compaction, which inflates the apparent memory cost. `COMPACT` with `measureRecall=false` skips dataset loading entirely, so the heap limit applies only to the compactor itself.
 
 This lets you prove that compaction can run on machines with very little RAM — e.g., `-Xmx5g` is sufficient even for large datasets.
 
@@ -65,7 +88,7 @@ Run with a large heap since the full dataset must be loaded into memory.
 java -Xmx220g --add-modules jdk.incubator.vector \
   -cp benchmarks-jmh/target/compactor-benchmark.jar \
   io.github.jbellis.jvector.bench.CompactorBenchmark \
-  -p workloadMode=PARTITION_ONLY \
+  -p workloadMode=PARTITION \
   -p datasetNames=ada002-100k \
   -p numPartitions=4 \
   -p splitDistribution=FIBONACCI \
@@ -83,7 +106,8 @@ The dataset is **not** loaded in this mode. Use a small `-Xmx` to measure and pr
 java -Xmx5g --add-modules jdk.incubator.vector \
   -cp benchmarks-jmh/target/compactor-benchmark.jar \
   io.github.jbellis.jvector.bench.CompactorBenchmark \
-  -p workloadMode=COMPACT_ONLY \
+  -p workloadMode=COMPACT \
+  -p measureRecall=false \
   -p datasetNames=ada002-100k \
   -p numPartitions=4 \
   -p splitDistribution=FIBONACCI \
@@ -100,7 +124,8 @@ java -Xmx5g --add-modules jdk.incubator.vector \
 | Parameter | Default | Description |
 |-----------|---------|-------------|
 | `datasetNames` | `ada002-100k` | Dataset name |
-| `workloadMode` | `PARTITION_AND_COMPACT` | Which phase(s) to run |
+| `workloadMode` | `PARTITION_AND_COMPACT` | Which phase(s) to run (`PARTITION`, `COMPACT`, `BUILD`, `PARTITION_AND_COMPACT`) |
+| `measureRecall` | `true` | Whether to run recall measurement after building/compacting (recall is never run in `PARTITION` mode) |
 | `numPartitions` | `4` | Number of source partition indexes |
 | `splitDistribution` | — | Data partitioning strategy (`UNIFORM`, `FIBONACCI`, …) |
 | `indexPrecision` | — | `FULLPRECISION` (inline vectors only) or `FUSEDPQ` (inline + FusedPQ) |
@@ -132,11 +157,11 @@ Key fields:
 | Field | Description |
 |-------|-------------|
 | `durationMs` | Time spent in the measured phase only |
-| `recall` | Recall@10 (present when workload mode includes recall, e.g. `PARTITION_AND_COMPACT`) |
+| `recall` | Recall@10 (present when `measureRecall=true` and the workload mode is not `PARTITION`, which never runs recall) |
 | `peakHeapMb` | Peak JVM heap observed during the run |
 
 ---
 
 # 7. Memory Footprint
 
-All datasets in the recall table (see `docs/compaction.md`) can be run under `COMPACT_ONLY` with `-Xmx5g`. Compaction also successfully scales to a dataset with 2560 dimensions and 10M vectors under the same constraint.
+All datasets in the recall table (see `docs/compaction.md`) can be run under `COMPACT` with `measureRecall=false` and `-Xmx5g`. Compaction also successfully scales to a dataset with 2560 dimensions and 10M vectors under the same constraint.