Skip to content

Commit c23e996

Browse files
committed
Add dense bulk fast path for multi-dimensional points via BinaryColumn
1 parent 9f30c43 commit c23e996

8 files changed

Lines changed: 910 additions & 9 deletions

File tree

lucene/CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,8 @@ API Changes
100100

101101
New Features
102102
---------------------
103+
* GITHUB#16209: Add dense bulk path for multi-dimensional point fields via BinaryColumn. (Prithvi S)
104+
103105
* GITHUB#15505: Upgrade snowball to 2d2e312df56f2ede014a4ffb3e91e6dea43c24be. New stemmer: PolishStemmer (and
104106
PolishSnowballAnalyzer in the stempel package) (Justas Sakalauskas, Dawid Weiss)
105107

lucene/core/src/java/org/apache/lucene/document/column/BinaryColumn.java

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121
import org.apache.lucene.util.BytesRef;
2222

2323
/**
24-
* A {@link Column} that provides variable-size binary values via a tuple cursor. Used for {@link
24+
* A {@link Column} that provides variable-size binary values via a tuple cursor, and dense values
25+
* via a {@link BinaryValuesCursor}. Used for {@link
2526
* org.apache.lucene.index.DocValuesType#BINARY BINARY}, {@link
2627
* org.apache.lucene.index.DocValuesType#SORTED SORTED}, and {@link
2728
* org.apache.lucene.index.DocValuesType#SORTED_SET SORTED_SET} doc values, and for stored/indexed
@@ -54,4 +55,15 @@ public StoredValue.Type storedType() {
5455

5556
/** Returns a fresh tuple cursor starting at the beginning of the batch. */
5657
public abstract ObjectTupleCursor<BytesRef> tuples();
58+
59+
/**
60+
* Returns a fresh values cursor iterating dense {@link BytesRef} values for doc-ids {@code [0,
61+
* numDocs)}. Must be overridden when {@link #density()} is {@link Column.Density#DENSE DENSE};
62+
* the default implementation throws {@link UnsupportedOperationException} and is never called for
63+
* {@link Column.Density#SPARSE SPARSE} columns.
64+
*/
65+
public BinaryValuesCursor values() {
66+
throw new UnsupportedOperationException(
67+
"values() requires density() == DENSE for column \"" + name() + "\"");
68+
}
5769
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.document.column;
18+
19+
import org.apache.lucene.util.BytesRef;
20+
21+
/**
22+
* A values cursor over a dense {@link BinaryColumn}. The cursor produces exactly {@link #size()}
23+
* {@link BytesRef} values for consecutive batch-local doc-ids starting at 0, one per call to {@link
24+
* #nextValue()}.
25+
*
26+
* @lucene.experimental
27+
*/
28+
public abstract class BinaryValuesCursor {
29+
30+
private final int size;
31+
32+
/**
33+
* Creates a cursor that will produce exactly {@code size} values, one per batch-local doc-id in
34+
* {@code [0, size)}. {@code size} is fixed for the cursor's lifetime and must equal the dense
35+
* column's {@code numDocs}.
36+
*
37+
* <p>Lucene's internal indexing paths will not consume past {@code size}. Defensive throws on
38+
* overrun are still encouraged to catch misuse from external callers.
39+
*/
40+
protected BinaryValuesCursor(int size) {
41+
this.size = size;
42+
}
43+
44+
/** Total number of values this cursor will produce. */
45+
public final int size() {
46+
return size;
47+
}
48+
49+
/**
50+
* Returns the next {@link BytesRef} value. Must not be called more than {@link #size()} times. The
51+
* returned {@link BytesRef} is only valid until the next call to {@link #nextValue()}.
52+
*/
53+
public abstract BytesRef nextValue();
54+
}

lucene/core/src/java/org/apache/lucene/document/column/package-info.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,9 @@
5151
* <li>A tuple cursor (e.g. {@link org.apache.lucene.document.column.LongTupleCursor}, {@link
5252
* org.apache.lucene.document.column.ObjectTupleCursor}) yields {@code (batchDocID, value)}
5353
* pairs in non-decreasing doc-id order. Always available.
54-
* <li>A bulk values cursor (e.g. {@link org.apache.lucene.document.column.LongValuesCursor})
55-
* feeds dense data directly into the underlying writer. Required when {@link
54+
* <li>A bulk values cursor (e.g. {@link org.apache.lucene.document.column.LongValuesCursor},
55+
* {@link org.apache.lucene.document.column.BinaryValuesCursor}) feeds dense data directly
56+
* into the underlying writer. Required when {@link
5657
* org.apache.lucene.document.column.Column#density()} is {@link
5758
* org.apache.lucene.document.column.Column.Density#DENSE DENSE} and consulted only in that
5859
* case.

lucene/core/src/java/org/apache/lucene/index/IndexingChain.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
import org.apache.lucene.document.NumericDocValuesField;
4545
import org.apache.lucene.document.StoredValue;
4646
import org.apache.lucene.document.column.BinaryColumn;
47+
import org.apache.lucene.document.column.BinaryValuesCursor;
4748
import org.apache.lucene.document.column.Column;
4849
import org.apache.lucene.document.column.ColumnBatch;
4950
import org.apache.lucene.document.column.ColumnFieldAdapter;
@@ -1083,6 +1084,20 @@ private static void processBinaryColumn(
10831084
final DocValuesType dvType = fieldType.docValuesType();
10841085
final boolean hasPoints = fieldType.pointDimensionCount() != 0;
10851086
final PointValuesWriter pointWriter = hasPoints ? pf.pointValuesWriter : null;
1087+
1088+
// Dense fast path: points-only with no doc values, stored, or index options.
1089+
if (column.density() == Column.Density.DENSE
1090+
&& dvType == DocValuesType.NONE
1091+
&& hasPoints
1092+
&& fieldType.stored() == false
1093+
&& fieldType.indexOptions() == IndexOptions.NONE) {
1094+
BinaryValuesCursor cursor = column.values();
1095+
ColumnValidation.checkDenseCount(column, cursor.size(), numDocs);
1096+
int packedLength = fieldType.pointDimensionCount() * fieldType.pointNumBytes();
1097+
pointWriter.addDenseNDValues(baseDocID, cursor, packedLength);
1098+
return;
1099+
}
1100+
10861101
final ObjectTupleCursor<BytesRef> cursor = column.tuples();
10871102

10881103
if (dvType == DocValuesType.NONE) {

lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java

Lines changed: 71 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import org.apache.lucene.codecs.MutablePointTree;
2121
import org.apache.lucene.codecs.PointsReader;
2222
import org.apache.lucene.codecs.PointsWriter;
23+
import org.apache.lucene.document.column.BinaryValuesCursor;
2324
import org.apache.lucene.document.column.LongValuesCursor;
2425
import org.apache.lucene.store.DataOutput;
2526
import org.apache.lucene.util.ArrayUtil;
@@ -48,6 +49,9 @@ class PointValuesWriter {
4849
private static final int POINTS_BUFFER_INT_VALUES = POINTS_BUFFER_BYTES / Integer.BYTES;
4950
private static final int POINTS_BUFFER_LONG_VALUES = POINTS_BUFFER_BYTES / Long.BYTES;
5051

52+
/** Minimum number of values the dense N-D buffer must hold; the last chunk may be smaller. */
53+
private static final int MIN_VALUES_PER_CHUNK = 64;
54+
5155
private byte[] densePointsBuffer;
5256

5357
PointValuesWriter(Counter bytesUsed, FieldInfo fieldInfo) {
@@ -98,7 +102,7 @@ void addDense1DIntValues(int firstDocID, LongValuesCursor cursor) throws IOExcep
98102
if (size == 0) {
99103
return;
100104
}
101-
final long ramBefore = reserveDense1D(firstDocID, size);
105+
final long ramBefore = reserveDense(firstDocID, size);
102106
final byte[] buffer = pointsDenseBuffer();
103107
int remaining = size;
104108
while (remaining > 0) {
@@ -107,7 +111,7 @@ void addDense1DIntValues(int firstDocID, LongValuesCursor cursor) throws IOExcep
107111
bytesOut.writeBytes(buffer, 0, chunk * Integer.BYTES);
108112
remaining -= chunk;
109113
}
110-
commitDense1D(firstDocID, size, ramBefore);
114+
commitDense(firstDocID, size, ramBefore);
111115
}
112116

113117
void addDense1DLongValues(int firstDocID, LongValuesCursor cursor) throws IOException {
@@ -116,7 +120,7 @@ void addDense1DLongValues(int firstDocID, LongValuesCursor cursor) throws IOExce
116120
if (size == 0) {
117121
return;
118122
}
119-
final long ramBefore = reserveDense1D(firstDocID, size);
123+
final long ramBefore = reserveDense(firstDocID, size);
120124
final byte[] dense = pointsDenseBuffer();
121125
int remaining = size;
122126
while (remaining > 0) {
@@ -125,7 +129,53 @@ void addDense1DLongValues(int firstDocID, LongValuesCursor cursor) throws IOExce
125129
bytesOut.writeBytes(dense, 0, chunk * Long.BYTES);
126130
remaining -= chunk;
127131
}
128-
commitDense1D(firstDocID, size, ramBefore);
132+
commitDense(firstDocID, size, ramBefore);
133+
}
134+
135+
/**
136+
* Bulk-adds dense N-dimensional packed point values from a {@link BinaryValuesCursor}. Each value
137+
* is a pre-encoded packed byte array of {@code packedLength} bytes.
138+
*/
139+
void addDenseNDValues(int firstDocID, BinaryValuesCursor cursor, int packedLength)
140+
throws IOException {
141+
if (fieldInfo.getPointDimensionCount() * fieldInfo.getPointNumBytes() != packedLength) {
142+
throw new IllegalArgumentException(
143+
"field="
144+
+ fieldInfo.name
145+
+ ": packedLength="
146+
+ packedLength
147+
+ " does not match pointDimensionCount="
148+
+ fieldInfo.getPointDimensionCount()
149+
+ " * pointNumBytes="
150+
+ fieldInfo.getPointNumBytes());
151+
}
152+
final int size = cursor.size();
153+
if (size == 0) {
154+
return;
155+
}
156+
final long ramBefore = reserveDense(firstDocID, size);
157+
final byte[] buffer = pointsDenseBuffer(packedLength);
158+
final int valuesPerBuffer = buffer.length / packedLength;
159+
int remaining = size;
160+
while (remaining > 0) {
161+
int chunk = Math.min(valuesPerBuffer, remaining);
162+
for (int i = 0; i < chunk; i++) {
163+
BytesRef value = cursor.nextValue();
164+
if (value.length != packedLength) {
165+
throw new IllegalArgumentException(
166+
"field="
167+
+ fieldInfo.name
168+
+ ": point value has length="
169+
+ value.length
170+
+ " but should be "
171+
+ packedLength);
172+
}
173+
System.arraycopy(value.bytes, value.offset, buffer, i * packedLength, packedLength);
174+
}
175+
bytesOut.writeBytes(buffer, 0, chunk * packedLength);
176+
remaining -= chunk;
177+
}
178+
commitDense(firstDocID, size, ramBefore);
129179
}
130180

131181
private byte[] pointsDenseBuffer() {
@@ -135,6 +185,21 @@ private byte[] pointsDenseBuffer() {
135185
return densePointsBuffer;
136186
}
137187

188+
/**
189+
* Returns a dense buffer sized to fit at least {@code POINTS_BUFFER_BYTES} worth of packed
190+
* values, or a larger buffer if {@code packedLength} requires it to hold at least {@code
191+
* MIN_VALUES_PER_CHUNK} values per chunk.
192+
*/
193+
private byte[] pointsDenseBuffer(int packedLength) {
194+
final int minBytes = packedLength * MIN_VALUES_PER_CHUNK;
195+
if (densePointsBuffer == null) {
196+
densePointsBuffer = new byte[Math.max(POINTS_BUFFER_BYTES, minBytes)];
197+
} else if (densePointsBuffer.length < minBytes) {
198+
densePointsBuffer = new byte[minBytes];
199+
}
200+
return densePointsBuffer;
201+
}
202+
138203
private void validate1DPacked(int byteWidth) {
139204
if (fieldInfo.getPointDimensionCount() != 1 || fieldInfo.getPointNumBytes() != byteWidth) {
140205
throw new IllegalArgumentException(
@@ -149,7 +214,7 @@ private void validate1DPacked(int byteWidth) {
149214
}
150215
}
151216

152-
private long reserveDense1D(int firstDocID, int size) {
217+
private long reserveDense(int firstDocID, int size) {
153218
assert firstDocID > lastDocID
154219
: "firstDocID=" + firstDocID + " must be > lastDocID=" + lastDocID;
155220
final int oldLength = docIDs.length;
@@ -163,7 +228,7 @@ private long reserveDense1D(int firstDocID, int size) {
163228
return bytes.ramBytesUsed();
164229
}
165230

166-
private void commitDense1D(int firstDocID, int size, long ramBefore) {
231+
private void commitDense(int firstDocID, int size, long ramBefore) {
167232
iwBytesUsed.addAndGet(bytes.ramBytesUsed() - ramBefore);
168233
numDocs += size;
169234
lastDocID = firstDocID + size - 1;

lucene/core/src/test/org/apache/lucene/document/column/ColumnBatchTestUtil.java

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -578,6 +578,49 @@ public int ordValue() {
578578
}
579579
}
580580

581+
/** Dense {@link BinaryColumn} backed by a contiguous BytesRef array. */
582+
public static class ArrayDenseBinaryColumn extends BinaryColumn {
583+
private final BytesRef[] values;
584+
585+
public ArrayDenseBinaryColumn(String name, IndexableFieldType fieldType, BytesRef[] values) {
586+
super(name, fieldType, Density.DENSE);
587+
this.values = values;
588+
}
589+
590+
@Override
591+
public ObjectTupleCursor<BytesRef> tuples() {
592+
return new ObjectTupleCursor<>() {
593+
int pos = -1;
594+
595+
@Override
596+
public int nextDoc() {
597+
pos++;
598+
return pos < values.length ? pos : DocIdSetIterator.NO_MORE_DOCS;
599+
}
600+
601+
@Override
602+
public BytesRef value() {
603+
return values[pos];
604+
}
605+
};
606+
}
607+
608+
@Override
609+
public BinaryValuesCursor values() {
610+
return new BinaryValuesCursor(values.length) {
611+
int pos = 0;
612+
613+
@Override
614+
public BytesRef nextValue() {
615+
if (pos >= values.length) {
616+
throw new IllegalStateException("BinaryValuesCursor exhausted: size=" + values.length);
617+
}
618+
return values[pos++];
619+
}
620+
};
621+
}
622+
}
623+
581624
/** Dense {@link DictionaryColumn} backed by a contiguous ordinal array. */
582625
public static class ArrayDenseDictionaryColumn extends DictionaryColumn {
583626
private final int[] ords;

0 commit comments

Comments
 (0)