Skip to content

Commit 8389d1a

Browse files
committed
Refactor workload modes and add BUILD_ONLY timing fix
Simplify WorkloadMode enum: PARTITION_ONLY/COMPACT_ONLY/COMPACT_AND_RECALL/BUILD_FROM_SCRATCH/BUILD_ONLY collapsed into PARTITION/COMPACT/BUILD/PARTITION_AND_COMPACT plus a separate measureRecall flag. Fix buildFromScratch timing to include PQ computation and graph construction (previously only timed the write step). Add fair comparison guidelines to CompactorBenchmark.md. Add run_compaction_perf.sh for measuring compact vs build performance.
1 parent 6178afa commit 8389d1a

4 files changed

Lines changed: 432 additions & 85 deletions

File tree

benchmarks-jmh/src/main/java/io/github/jbellis/jvector/bench/CompactorBenchmark.java

Lines changed: 68 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -107,26 +107,22 @@ public enum IndexPrecision {
107107

108108
public enum WorkloadMode {
109109
/**
110-
* Build per-source partitions and stop. (No compaction, no recall.)
110+
* Build per-source partitions and stop.
111111
*/
112-
PARTITION_ONLY,
112+
PARTITION,
113113

114114
/**
115115
* Assume partitions exist on disk; compact them.
116116
*/
117-
COMPACT_ONLY,
117+
COMPACT,
118118

119119
/**
120-
* Assume partitions exist on disk; compact them, then run recall.
120+
* Build a single graph for the whole dataset and write it.
121121
*/
122-
COMPACT_AND_RECALL,
122+
BUILD,
123123

124124
/**
125-
* Build a single graph for the whole dataset and write it. Then run recall.
126-
*/
127-
BUILD_FROM_SCRATCH,
128-
/**
129-
* (Default) Build partitions, compact them, then run recall.
125+
* (Default) Build partitions, compact them.
130126
*/
131127
PARTITION_AND_COMPACT
132128
}
@@ -278,6 +274,9 @@ private static void writeCompletedCount(int count) {
278274
@Param({"PARTITION_AND_COMPACT"})
279275
public WorkloadMode workloadMode;
280276

277+
@Param({"true"})
278+
public boolean measureRecall;
279+
281280
@Param({"4"}) // Default value, can be overridden via command line
282281
public int numPartitions;
283282

@@ -387,25 +386,10 @@ public void setup() throws Exception {
387386

388387
int dimension;
389388

390-
if (workloadMode == WorkloadMode.COMPACT_ONLY) {
391-
ds = null;
392-
queryVectors = null;
393-
groundTruth = null;
394-
ravv = null;
395-
baseVectors = null;
396-
dimension = -1;
397-
398-
var datasetInfo = DataSets.loadDataSet(datasetNames);
399-
similarityFunction = datasetInfo
400-
.flatMap(DataSetInfo::similarityFunction)
401-
.orElseGet(() -> {
402-
log.warn("Could not determine similarity function for dataset '{}'; defaulting to COSINE", datasetNames);
403-
return VectorSimilarityFunction.COSINE;
404-
});
389+
boolean needsBaseVectors = workloadMode != WorkloadMode.COMPACT;
390+
boolean needsRecallData = measureRecall && workloadMode != WorkloadMode.PARTITION;
405391

406-
log.info("Skipping dataset load for COMPACT_ONLY mode without recall. Workload: {}, similarityFunction: {}, Live nodes rate: {}",
407-
workloadMode, similarityFunction, liveNodesRate);
408-
} else {
392+
if (needsBaseVectors) {
409393
ds = DataSets.loadDataSet(datasetNames)
410394
.orElseThrow(() -> new RuntimeException("Dataset not found: " + datasetNames))
411395
.getDataSet();
@@ -425,13 +409,38 @@ public void setup() throws Exception {
425409
ravv = new ListRandomAccessVectorValues(baseVectors, ds.getDimension());
426410
}
427411

428-
queryVectors = ds.getQueryVectors();
429-
groundTruth = ds.getGroundTruth();
430412
similarityFunction = ds.getSimilarityFunction();
431413
dimension = ds.getDimension();
432414

433-
log.info("Dataset {} loaded with recall data. Base vectors: {} (portion {}), Query vectors: {}, Dim: {}, Similarity: {}, Workload: {}, Live nodes rate: {}",
434-
datasetNames, ravv.size(), datasetPortion, queryVectors.size(), dimension, similarityFunction, workloadMode, liveNodesRate);
415+
if (needsRecallData) {
416+
queryVectors = ds.getQueryVectors();
417+
groundTruth = ds.getGroundTruth();
418+
log.info("Dataset {} loaded with recall data. Base vectors: {} (portion {}), Query vectors: {}, Dim: {}, Similarity: {}, Workload: {}, measureRecall: {}, Live nodes rate: {}",
419+
datasetNames, ravv.size(), datasetPortion, queryVectors.size(), dimension, similarityFunction, workloadMode, measureRecall, liveNodesRate);
420+
} else {
421+
queryVectors = null;
422+
groundTruth = null;
423+
log.info("Dataset {} loaded (base vectors only). Base vectors: {} (portion {}), Dim: {}, Similarity: {}, Workload: {}, measureRecall: {}",
424+
datasetNames, ravv.size(), datasetPortion, dimension, similarityFunction, workloadMode, measureRecall);
425+
}
426+
} else {
427+
ds = null;
428+
queryVectors = null;
429+
groundTruth = null;
430+
ravv = null;
431+
baseVectors = null;
432+
dimension = -1;
433+
434+
var datasetInfo = DataSets.loadDataSet(datasetNames);
435+
similarityFunction = datasetInfo
436+
.flatMap(DataSetInfo::similarityFunction)
437+
.orElseGet(() -> {
438+
log.warn("Could not determine similarity function for dataset '{}'; defaulting to COSINE", datasetNames);
439+
return VectorSimilarityFunction.COSINE;
440+
});
441+
442+
log.info("Skipping dataset load for {} mode. similarityFunction: {}, Live nodes rate: {}",
443+
workloadMode, similarityFunction, liveNodesRate);
435444
}
436445

437446
// Resolve storagePaths + partitionsDir
@@ -440,22 +449,18 @@ public void setup() throws Exception {
440449
compactOutputPath = resolveCompactOutputPath(partitionsBaseDir);
441450
scratchOutputPath = resolveScratchOutputPath(partitionsBaseDir);
442451

443-
// Clean stale artifacts only if we're going to rebuild them.
444-
if (workloadMode == WorkloadMode.COMPACT_ONLY || workloadMode == WorkloadMode.COMPACT_AND_RECALL) {
445-
// For compact-only and compact-and-recall, ensure the partition files exist.
452+
if (workloadMode == WorkloadMode.COMPACT) {
446453
verifyPartitionsExist(partitionsBaseDir, numPartitions);
447454
}
448455

449-
// Partition metadata for remapping (needed for compaction)
450-
if (workloadMode == WorkloadMode.PARTITION_ONLY || workloadMode == WorkloadMode.PARTITION_AND_COMPACT) {
456+
if (workloadMode == WorkloadMode.PARTITION || workloadMode == WorkloadMode.PARTITION_AND_COMPACT) {
451457
var partitionedData = DataSetPartitioner.partition(baseVectors, numPartitions, splitDistribution);
452458
vectorsPerSourceCount = partitionedData.sizes;
453459
} else {
454460
vectorsPerSourceCount = null;
455461
}
456462

457-
// Build partitions during setup for SEGMENTS_* (matches original benchmark structure)
458-
if (workloadMode == WorkloadMode.PARTITION_ONLY || workloadMode == WorkloadMode.PARTITION_AND_COMPACT) {
463+
if (workloadMode == WorkloadMode.PARTITION || workloadMode == WorkloadMode.PARTITION_AND_COMPACT) {
459464
if (jfrPartitioning) {
460465
jfrPartitioningRecorder.start(JFR_DIR, "partitioning-" + jfrParamSuffix() + ".jfr", jfrObjectCount);
461466
}
@@ -472,8 +477,8 @@ public void setup() throws Exception {
472477
}
473478

474479
private void validateParams() {
475-
if (workloadMode == WorkloadMode.BUILD_FROM_SCRATCH) {
476-
log.warn("numPartitions={} ignored in BUILD_FROM_SCRATCH mode", numPartitions);
480+
if (workloadMode == WorkloadMode.BUILD) {
481+
log.warn("numPartitions={} ignored in BUILD mode", numPartitions);
477482
}
478483
else {
479484
if (numPartitions <= 1) throw new IllegalArgumentException("numPartitions must be larger than one");
@@ -570,7 +575,7 @@ private void verifyPartitionsExist(Path partitionsDir, int numPartitions) {
570575
for (int i = 0; i < numPartitions; i++) {
571576
Path seg = partitionsDir.resolve("per-source-graph-" + i);
572577
if (!Files.exists(seg)) {
573-
throw new IllegalStateException("Missing partition file for COMPACT_ONLY or COMPACT_AND_RECALL: " + seg.toAbsolutePath());
578+
throw new IllegalStateException("Missing partition file for COMPACT mode: " + seg.toAbsolutePath());
574579
}
575580
}
576581
}
@@ -701,6 +706,14 @@ private long buildFromScratch(List<VectorFloat<?>> baseVectors) throws Exception
701706

702707
int dimension = baseVectors.get(0).length();
703708
var full = new ListRandomAccessVectorValues(baseVectors, dimension);
709+
710+
log.info("Building from scratch: vectors={} dim={} sim={} deg={} bw={} precision={} pwThreads={} vp={} -> {}",
711+
full.size(), dimension, similarityFunction,
712+
graphDegree, beamWidth, indexPrecision, parallelWriteThreads, resolvedVectorizationProvider,
713+
scratchOutputPath.toAbsolutePath());
714+
715+
long startNanos = System.nanoTime();
716+
704717
ProductQuantization pq = null;
705718
PQVectors pqVectors = null;
706719
BuildScoreProvider bsp;
@@ -714,11 +727,6 @@ private long buildFromScratch(List<VectorFloat<?>> baseVectors) throws Exception
714727
bsp = BuildScoreProvider.randomAccessScoreProvider(full, similarityFunction);
715728
}
716729

717-
log.info("Building from scratch: vectors={} dim={} sim={} deg={} bw={} precision={} pwThreads={} vp={} -> {}",
718-
full.size(), dimension, similarityFunction,
719-
graphDegree, beamWidth, indexPrecision, parallelWriteThreads, resolvedVectorizationProvider,
720-
scratchOutputPath.toAbsolutePath());
721-
722730
var builder = new GraphIndexBuilder(bsp, dimension, graphDegree, beamWidth, 1.2f, 1.2f, true);
723731
var graph = builder.build(full);
724732

@@ -730,19 +738,10 @@ private long buildFromScratch(List<VectorFloat<?>> baseVectors) throws Exception
730738

731739
writerBuilder.with(new InlineVectors(dimension));
732740

733-
// ProductQuantization pq = null;
734-
// PQVectors pqVectors = null;
735-
// if (indexPrecision == IndexPrecision.FUSEDPQ) {
736-
// boolean centerData = similarityFunction == VectorSimilarityFunction.EUCLIDEAN;
737-
// pq = ProductQuantization.compute(full, dimension / 8, 256, centerData);
738-
// pqVectors = (PQVectors) pq.encodeAll(full);
739-
// writerBuilder.with(new FusedPQ(graph.maxDegree(), pq));
740-
// }
741741
if (indexPrecision == IndexPrecision.FUSEDPQ) {
742742
writerBuilder.with(new FusedPQ(graph.maxDegree(), pq));
743743
}
744744

745-
long startNanos = System.nanoTime();
746745
try (var writer = writerBuilder.build()) {
747746
var suppliers = new EnumMap<FeatureId, IntFunction<Feature.State>>(FeatureId.class);
748747
suppliers.put(FeatureId.INLINE_VECTORS, ord -> new InlineVectors.State(full.getVector(ord)));
@@ -814,27 +813,28 @@ public void run(Blackhole blackhole, RecallResult recallResult) throws Exception
814813

815814
// Execute workload
816815
switch (workloadMode) {
817-
case PARTITION_ONLY:
816+
case PARTITION:
818817
break;
819818

820-
case COMPACT_ONLY:
819+
case COMPACT:
821820
durationMs = compactPartitions();
821+
if (measureRecall) {
822+
recall = runRecall(compactOutputPath);
823+
}
822824
break;
823825

824-
case COMPACT_AND_RECALL:
825-
durationMs = compactPartitions();
826-
recall = runRecall(compactOutputPath);
827-
break;
828-
829-
case BUILD_FROM_SCRATCH: {
826+
case BUILD:
830827
durationMs = buildFromScratch(baseVectors);
831-
recall = runRecall(scratchOutputPath);
828+
if (measureRecall) {
829+
recall = runRecall(scratchOutputPath);
830+
}
832831
break;
833-
}
834832

835833
case PARTITION_AND_COMPACT:
836834
durationMs = compactPartitions();
837-
recall = runRecall(compactOutputPath);
835+
if (measureRecall) {
836+
recall = runRecall(compactOutputPath);
837+
}
838838
break;
839839

840840
default:
@@ -902,6 +902,7 @@ private LinkedHashMap<String, Object> buildParams() {
902902
params.put("parallelWriteThreads", parallelWriteThreads);
903903
params.put("vectorizationProvider", resolvedVectorizationProvider);
904904
params.put("datasetPortion", datasetPortion);
905+
params.put("measureRecall", measureRecall);
905906
params.put("jfrPartitioning", jfrPartitioning);
906907
params.put("jfrCompacting", jfrCompacting);
907908
params.put("jfrObjectCount", jfrObjectCount);

benchmarks-jmh/src/main/java/io/github/jbellis/jvector/bench/CompactorBenchmark.md

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,33 @@
2424

2525
| Mode | Description |
2626
|------|-------------|
27-
| `PARTITION_AND_COMPACT` | **(default)** Build partitions, compact them, then measure recall — all in one run |
28-
| `PARTITION_ONLY` | Build N partition indexes and exit; no compaction |
29-
| `COMPACT_ONLY` | Compact existing partitions without loading the dataset |
30-
| `BUILD_FROM_SCRATCH` | Build a single index over the full dataset |
27+
| `PARTITION_AND_COMPACT` | **(default)** Build partitions, compact them |
28+
| `PARTITION` | Build N partition indexes and exit; no compaction |
29+
| `COMPACT` | Compact existing partitions |
30+
| `BUILD` | Build a single index over the full dataset |
31+
32+
## measureRecall
33+
34+
Set `-p measureRecall=false` to skip recall measurement. For `COMPACT` mode this also
35+
skips dataset loading entirely, since query vectors and ground truth are not needed.
36+
37+
## Fair comparison guidelines
38+
39+
**Recall**: use `PARTITION_AND_COMPACT` vs `BUILD`, both with `measureRecall=true` and
40+
the same dataset, `indexPrecision`, `graphDegree`, and `beamWidth`. Both modes search
41+
using FusedPQ with FP reranking. The `recall` field in the JSONL output is directly
42+
comparable.
43+
44+
**Build performance**: use `COMPACT` vs `BUILD`, both with `measureRecall=false`.
45+
The `durationMs` field measures only the graph construction pipeline (PQ training +
46+
graph build + write for `BUILD`; PQ retraining + neighbor gathering + write for
47+
`COMPACT`). Dataset loading is excluded from `durationMs` in both modes.
48+
Run `PARTITION` first to create the partition files needed by `COMPACT`.
49+
50+
**Memory footprint**: run `COMPACT` with `measureRecall=false` and a small `-Xmx`
51+
(e.g., 5g). Since `COMPACT` does not load the dataset into heap, the heap limit
52+
reflects only the compactor's own memory usage. `BUILD` always requires the full
53+
dataset in memory, so its heap requirement scales with dataset size.
3154

3255
---
3356

@@ -53,7 +76,7 @@ java -Xmx220g --add-modules jdk.incubator.vector \
5376

5477
# 3. Measuring Peak Heap During Compaction
5578

56-
The two-step workflow (`PARTITION_ONLY` → `COMPACT_ONLY`) exists to isolate compaction's true memory footprint. In `PARTITION_AND_COMPACT` mode the dataset is still resident in heap during compaction, which inflates the apparent memory cost. `COMPACT_ONLY` skips dataset loading entirely, so the heap limit applies only to the compactor itself.
79+
The two-step workflow (`PARTITION` → `COMPACT` with `measureRecall=false`) exists to isolate compaction's true memory footprint. In `PARTITION_AND_COMPACT` mode the dataset is still resident in heap during compaction, which inflates the apparent memory cost. `COMPACT` with `measureRecall=false` skips dataset loading entirely, so the heap limit applies only to the compactor itself.
5780

5881
This lets you prove that compaction can run on machines with very little RAM — e.g., `-Xmx5g` is sufficient even for large datasets.
5982

@@ -65,7 +88,7 @@ Run with a large heap since the full dataset must be loaded into memory.
6588
java -Xmx220g --add-modules jdk.incubator.vector \
6689
-cp benchmarks-jmh/target/compactor-benchmark.jar \
6790
io.github.jbellis.jvector.bench.CompactorBenchmark \
68-
-p workloadMode=PARTITION_ONLY \
91+
-p workloadMode=PARTITION \
6992
-p datasetNames=ada002-100k \
7093
-p numPartitions=4 \
7194
-p splitDistribution=FIBONACCI \
@@ -83,7 +106,8 @@ The dataset is **not** loaded in this mode. Use a small `-Xmx` to measure and pr
83106
java -Xmx5g --add-modules jdk.incubator.vector \
84107
-cp benchmarks-jmh/target/compactor-benchmark.jar \
85108
io.github.jbellis.jvector.bench.CompactorBenchmark \
86-
-p workloadMode=COMPACT_ONLY \
109+
-p workloadMode=COMPACT \
110+
-p measureRecall=false \
87111
-p datasetNames=ada002-100k \
88112
-p numPartitions=4 \
89113
-p splitDistribution=FIBONACCI \
@@ -100,7 +124,8 @@ java -Xmx5g --add-modules jdk.incubator.vector \
100124
| Parameter | Default | Description |
101125
|-----------|---------|-------------|
102126
| `datasetNames` | `ada002-100k` | Dataset name |
103-
| `workloadMode` | `PARTITION_AND_COMPACT` | Which phase(s) to run |
127+
| `workloadMode` | `PARTITION_AND_COMPACT` | Which phase(s) to run (`PARTITION`, `COMPACT`, `BUILD`, `PARTITION_AND_COMPACT`) |
128+
| `measureRecall` | `true` | Whether to run recall measurement after building/compacting |
104129
| `numPartitions` | `4` | Number of source partition indexes |
105130
| `splitDistribution` || Data partitioning strategy (`UNIFORM`, `FIBONACCI`, …) |
106131
| `indexPrecision` || `FULLPRECISION` (inline vectors only) or `FUSEDPQ` (inline + FusedPQ) |
@@ -132,11 +157,11 @@ Key fields:
132157
| Field | Description |
133158
|-------|-------------|
134159
| `durationMs` | Time spent in the measured phase only |
135-
| `recall` | Recall@10 (present when workload mode includes recall, e.g. `PARTITION_AND_COMPACT`) |
160+
| `recall` | Recall@10 (present when `measureRecall=true`) |
136161
| `peakHeapMb` | Peak JVM heap observed during the run |
137162

138163
---
139164

140165
# 7. Memory Footprint
141166

142-
All datasets in the recall table (see `docs/compaction.md`) can be run under `COMPACT_ONLY` with `-Xmx5g`. Compaction also successfully scales to a dataset with 2560 dimensions and 10M vectors under the same constraint.
167+
All datasets in the recall table (see `docs/compaction.md`) can be run under `COMPACT` with `measureRecall=false` and `-Xmx5g`. Compaction also successfully scales to a dataset with 2560 dimensions and 10M vectors under the same constraint.

0 commit comments

Comments
 (0)