Skip to content

Commit 42bf3d0

Browse files
authored
[vector] Add unified vector index integration (#8174)
1 parent 96cd9a0 commit 42bf3d0

32 files changed

Lines changed: 3300 additions & 24 deletions
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
################################################################################
2+
# Licensed to the Apache Software Foundation (ASF) under one
3+
# or more contributor license agreements. See the NOTICE file
4+
# distributed with this work for additional information
5+
# regarding copyright ownership. The ASF licenses this file
6+
# to you under the Apache License, Version 2.0 (the
7+
# "License"); you may not use this file except in compliance
8+
# with the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
################################################################################
18+
19+
name: UTCase Vector Index

# Run only when the vector modules change.
on:
  push:
    paths:
      - 'paimon-vector/**'
  pull_request:
    paths:
      - 'paimon-vector/**'

env:
  JDK_VERSION: 8
  # Heap size and HTTP wagon settings are kept in one place: a step-level
  # MAVEN_OPTS would replace this value instead of extending it, which would
  # silently drop the connection TTL / retry flags for the Maven step.
  MAVEN_OPTS: -Xmx4096m -Dmaven.wagon.httpconnectionManager.ttlSeconds=30 -Dmaven.wagon.http.retryHandler.requestSentEnabled=true

# One active run per workflow / event / pull request; a newer run cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.number || github.run_id }}
  cancel-in-progress: true

jobs:
  vector_index_test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Set up JDK ${{ env.JDK_VERSION }}
        uses: actions/setup-java@v5
        with:
          java-version: ${{ env.JDK_VERSION }}
          distribution: 'temurin'

      - name: Install Rust toolchain
        run: |
          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable --profile minimal
          # Make cargo available to the following steps.
          echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"

      - name: Clone and build paimon-vector-index native library
        run: |
          git clone --depth 1 https://github.com/apache/paimon-vector-index.git /tmp/paimon-vector-index
          cd /tmp/paimon-vector-index
          cargo build --release -p paimon-vindex-jni

      - name: Copy native library to resources
        run: |
          RESOURCE_DIR=paimon-vector/paimon-vector-jni/src/main/resources/native/linux-amd64
          mkdir -p "${RESOURCE_DIR}"
          cp /tmp/paimon-vector-index/target/release/libpaimon_vindex_jni.so "${RESOURCE_DIR}/"

      - name: Build and test vector index modules
        timeout-minutes: 30
        run: |
          # Install all modules without tests first, then verify only the vector modules.
          mvn -T 2C -B -ntp clean install -DskipTests
          mvn -B -ntp verify -pl paimon-vector/paimon-vector-jni,paimon-vector/paimon-vector-index -Dcheckstyle.skip=true -Dspotless.check.skip=true

docs/docs/flink/procedures.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1004,7 +1004,7 @@ All available procedures are listed below.
10041004
To create a global index on a table for accelerating queries. Arguments:
10051005
<li>table(required): the target table identifier.</li>
10061006
<li>index_column(required): the column name to build index on.</li>
1007-
<li>index_type(required): the type of global index, supported types include 'btree', 'lumina', 'tantivy-fulltext'.</li>
1007+
<li>index_type(required): the type of global index, supported types include 'btree', 'ivf-flat', 'ivf-pq', 'ivf-hnsw-flat', 'ivf-hnsw-sq', 'tantivy-fulltext'.</li>
10081008
<li>partitions(optional): partition filter for selective index creation.</li>
10091009
<li>options(optional): additional dynamic options for index creation.</li>
10101010
</td>

docs/docs/learn-paimon/scenario-guide.mdx

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ configurations that are suited for different scenarios.
4444
| Queue-like ordered streaming | Append Table | `bucket = N, bucket-key = col` |
4545
| Large-scale OLAP with ad-hoc queries | Append Table | Incremental Clustering |
4646
| Store images / videos / documents | Append Table (Blob) | `__BLOB_FIELD` comment, Data Evolution enabled |
47-
| AI vector search / RAG | Append Table (Vector) | `VECTOR` type, Global Index (DiskANN) |
47+
| AI vector search / RAG | Append Table (Vector) | `VECTOR` type, Vector Global Index |
4848
| AI feature engineering & column evolution | Append Table | `data-evolution.enabled = true` |
4949
| Python AI pipeline (Ray / PyTorch) | Append Table | PyPaimon SDK |
5050

@@ -456,21 +456,19 @@ Schema schema = Schema.newBuilder()
456456
**Build the vector index and search:**
457457

458458
```sql
459-
-- Build DiskANN vector index
459+
-- Build IVF-PQ vector index
460460
CALL sys.create_global_index(
461461
table => 'db.doc_embeddings',
462462
index_column => 'embedding',
463-
index_type => 'lumina',
464-
options => 'lumina.index.dimension=768'
463+
index_type => 'ivf-pq',
464+
options => 'ivf-pq.distance.metric=cosine,ivf-pq.nlist=256,ivf-pq.pq.m=16'
465465
);
466466

467467
-- Search for top-5 nearest neighbors
468468
SELECT * FROM vector_search('doc_embeddings', 'embedding', array(0.1f, 0.2f, ...), 5);
469469
```
470470

471-
The legacy index type `lumina-vector-ann` is still accepted for existing tables and SQL compatibility.
472-
473-
**Why:** The [Global Index](../multimodal-table/global-index) with DiskANN provides high-performance ANN search.
471+
**Why:** The [Global Index](../multimodal-table/global-index) with vector indexes provides high-performance ANN search.
474472
Vector data is stored in dedicated `.vector.lance` files optimized for dense vectors, while scalar columns stay in
475473
Parquet. You can also build a **BTree Index** on scalar columns for efficient filtering:
476474

@@ -664,7 +662,7 @@ Do you need upsert / update / delete?
664662
665663
└── AI / Multimodal scenarios? → Enable Data Evolution
666664
├── Store images / videos / docs? → Blob Table (__BLOB_FIELD comment)
667-
├── Vector search / RAG? → VECTOR type + Global Index (DiskANN)
665+
├── Vector search / RAG? → VECTOR type + Vector Global Index
668666
├── Feature engineering? → Data Evolution (MERGE INTO partial columns)
669667
└── Python pipeline? → PyPaimon (Ray / PyTorch / Pandas)
670668
```

docs/docs/multimodal-table/global-index.mdx

Lines changed: 38 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ Global Index is a powerful indexing mechanism for Data Evolution (append) tables
3333
without full-table scans. Paimon supports multiple global index types:
3434

3535
- **BTree Index**: A B-tree based index for scalar column lookups. Supports equality, IN, range predicates, and can be combined across multiple columns with AND/OR logic.
36-
- **Vector Index**: An approximate nearest neighbor (ANN) index powered by DiskANN for vector similarity search.
36+
- **Vector Index**: An approximate nearest neighbor (ANN) index powered by Paimon's vector index library for vector similarity search.
3737
- **Full-Text Index**: A full-text search index powered by Tantivy for text retrieval. Supports term matching and relevance scoring.
3838

3939
Global indexes work on top of Data Evolution tables. To use global indexes, your table **must** have:
@@ -87,26 +87,55 @@ SELECT * FROM my_table WHERE name IN ('a200', 'a300');
8787

8888
## Vector Index
8989

90-
Vector Index provides approximate nearest neighbor (ANN) search based on the DiskANN algorithm. It is suitable for
91-
vector similarity search scenarios such as recommendation systems, image retrieval, and RAG (Retrieval Augmented
92-
Generation) applications.
90+
Vector Index provides approximate nearest neighbor (ANN) search for vector similarity search scenarios such as
91+
recommendation systems, image retrieval, and RAG (Retrieval Augmented Generation) applications.
92+
93+
Supported vector index types:
94+
95+
| Index Type | Description |
96+
|---|---|
97+
| `ivf-flat` | IVF index with flat vector storage. |
98+
| `ivf-pq` | IVF index with product quantization. |
99+
| `ivf-hnsw-flat` | IVF index with HNSW flat quantizer. |
100+
| `ivf-hnsw-sq` | IVF index with HNSW scalar quantizer. |
93101

94102
**Build Vector Index**
95103

96104
```sql
97-
-- Create Lumina vector index on 'embedding' column
105+
-- Create IVF-PQ vector index on 'embedding' column
98106
CALL sys.create_global_index(
99107
table => 'db.my_table',
100108
index_column => 'embedding',
101-
index_type => 'lumina',
102-
options => 'lumina.index.dimension=128'
109+
index_type => 'ivf-pq',
110+
options => 'ivf-pq.distance.metric=cosine,ivf-pq.nlist=256,ivf-pq.pq.m=16'
103111
);
104112
```
105113

106-
The legacy index type `lumina-vector-ann` is still accepted for existing tables and SQL compatibility.
114+
For `ARRAY<FLOAT>` vector columns, specify the vector dimension with `<index-type>.dimension`.
115+
For `VECTOR<FLOAT>` columns, Paimon uses the dimension from the column type.
116+
117+
Supported vector index options:
118+
119+
| Option | Default | Description |
120+
|---|---|---|
121+
| `<index-type>.dimension` | `128` | Vector dimension for `ARRAY<FLOAT>` columns. Ignored for `VECTOR<FLOAT>` columns. |
122+
| `<index-type>.distance.metric` | `inner_product` | Distance metric. Supported values: `l2`, `cosine`, `inner_product`. |
123+
| `<index-type>.nlist` | `256` | Number of IVF clusters used during index build. |
124+
| `<index-type>.pq.m` | `16` | Number of PQ sub-vectors for `ivf-pq`. The vector dimension must be divisible by this value. |
125+
| `<index-type>.pq.use-opq` | `false` | Whether to enable OPQ for `ivf-pq`. |
126+
| `<index-type>.hnsw.m` | `20` | HNSW graph out-degree for `ivf-hnsw-flat` and `ivf-hnsw-sq`. |
127+
| `<index-type>.hnsw.ef-construction` | `150` | HNSW construction search width for `ivf-hnsw-flat` and `ivf-hnsw-sq`. |
128+
| `<index-type>.hnsw.max-level` | `7` | Maximum HNSW level for `ivf-hnsw-flat` and `ivf-hnsw-sq`. |
107129

108130
**Vector Search**
109131

132+
Search-time options are passed with each vector search request:
133+
134+
| Option | Default | Description |
135+
|---|---|---|
136+
| `ivf.nprobe` | `16` | Number of IVF clusters to probe during search. |
137+
| `hnsw.ef_search` | `0` | HNSW search width during search. `0` uses the native library default. |
138+
110139
<Tabs groupId="vector-search">
111140

112141
<TabItem value="spark-sql" label="Spark SQL">
@@ -155,6 +184,7 @@ GlobalIndexResult result = table.newVectorSearchBuilder()
155184
.withVector(queryVector)
156185
.withLimit(5)
157186
.withVectorColumn("embedding")
187+
.withOption("ivf.nprobe", "16")
158188
.executeLocal();
159189

160190
// Step 2: Read matching rows using the search result

docs/docs/multimodal-table/index.mdx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ Key capabilities:
3737
- **[Data Evolution](./data-evolution)**: Update partial columns without rewriting entire files, enabling efficient schema evolution.
3838
- **[Blob Storage](./blob)**: Store large binary objects (images, videos, audio) in dedicated `.blob` files with efficient column projection.
3939
- **[Vector Storage](./vector)**: Store and manage vector embeddings in dedicated Vortex-format files optimized for vector workloads.
40-
- **[Global Index](./global-index)**: Build BTree, vector (DiskANN), and full-text (Tantivy) indexes for efficient lookups and similarity search.
40+
- **[Global Index](./global-index)**: Build BTree, vector, and full-text (Tantivy) indexes for efficient lookups and similarity search.
4141

4242
All multimodal features require the following table properties:
4343

paimon-common/src/main/java/org/apache/paimon/fs/VectoredReadUtils.java

Lines changed: 109 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,20 +51,33 @@ public static void readVectored(VectoredReadable readable, List<? extends FileRa
5151
if (ranges.isEmpty()) {
5252
return;
5353
}
54+
readVectored(readable, ranges, ReadOptions.from(readable));
55+
}
56+
57+
public static void readVectored(
58+
VectoredReadable readable, List<? extends FileRange> ranges, ReadOptions options)
59+
throws IOException {
60+
if (ranges.isEmpty()) {
61+
return;
62+
}
63+
requireNonNull(readable, "readable is null");
64+
requireNonNull(options, "options is null");
5465

5566
List<? extends FileRange> sortRanges = validateAndSortRanges(ranges);
5667
List<CombinedRange> combinedRanges =
57-
mergeSortedRanges(sortRanges, readable.minSeekForVectorReads());
68+
mergeSortedRanges(sortRanges, options.minSeekForVectorReads);
5869

59-
int parallelism = readable.parallelismForVectorReads();
70+
int parallelism = options.parallelismForVectorReads;
6071

61-
if (combinedRanges.size() == 1 && readable instanceof SeekableInputStream) {
72+
if (options.sequentialReadFallback
73+
&& combinedRanges.size() == 1
74+
&& readable instanceof SeekableInputStream) {
6275
fallbackToReadSequence((SeekableInputStream) readable, sortRanges);
6376
return;
6477
}
6578

6679
BlockingExecutor executor = new BlockingExecutor(IO_THREAD_POOL, parallelism);
67-
long batchSize = readable.batchSizeForVectorReads();
80+
long batchSize = options.batchSizeForVectorReads;
6881
for (CombinedRange combinedRange : combinedRanges) {
6982
if (combinedRange.underlying.size() == 1) {
7083
FileRange fileRange = combinedRange.underlying.get(0);
@@ -76,12 +89,95 @@ public static void readVectored(VectoredReadable readable, List<? extends FileRa
7689
List<CompletableFuture<byte[]>> futures =
7790
splitBatches.stream().map(FileRange::getData).collect(Collectors.toList());
7891
CompletableFuture.allOf(futures.toArray(new CompletableFuture<?>[0]))
79-
.thenAcceptAsync(
80-
unused -> copyToFileRanges(combinedRange, futures), IO_THREAD_POOL);
92+
.whenCompleteAsync(
93+
(unused, throwable) -> {
94+
if (throwable == null) {
95+
try {
96+
copyToFileRanges(combinedRange, futures);
97+
} catch (Throwable t) {
98+
completeFileRangesExceptionally(combinedRange, t);
99+
}
100+
} else {
101+
completeFileRangesExceptionally(combinedRange, throwable);
102+
}
103+
},
104+
IO_THREAD_POOL);
81105
}
82106
}
83107
}
84108

109+
/** Options for vectored reads. */
110+
public static class ReadOptions {
111+
112+
private final int minSeekForVectorReads;
113+
private final long batchSizeForVectorReads;
114+
private final int parallelismForVectorReads;
115+
private final boolean sequentialReadFallback;
116+
117+
public static ReadOptions from(VectoredReadable readable) {
118+
return new ReadOptions(
119+
readable.minSeekForVectorReads(),
120+
readable.batchSizeForVectorReads(),
121+
readable.parallelismForVectorReads(),
122+
true);
123+
}
124+
125+
public ReadOptions(
126+
int minSeekForVectorReads,
127+
long batchSizeForVectorReads,
128+
int parallelismForVectorReads,
129+
boolean sequentialReadFallback) {
130+
checkArgument(
131+
minSeekForVectorReads >= 0,
132+
"minSeekForVectorReads must be non-negative: %s",
133+
minSeekForVectorReads);
134+
checkArgument(
135+
batchSizeForVectorReads > 0,
136+
"batchSizeForVectorReads must be positive: %s",
137+
batchSizeForVectorReads);
138+
checkArgument(
139+
parallelismForVectorReads > 0,
140+
"parallelismForVectorReads must be positive: %s",
141+
parallelismForVectorReads);
142+
this.minSeekForVectorReads = minSeekForVectorReads;
143+
this.batchSizeForVectorReads = batchSizeForVectorReads;
144+
this.parallelismForVectorReads = parallelismForVectorReads;
145+
this.sequentialReadFallback = sequentialReadFallback;
146+
}
147+
148+
public ReadOptions withMinSeekForVectorReads(int minSeekForVectorReads) {
149+
return new ReadOptions(
150+
minSeekForVectorReads,
151+
batchSizeForVectorReads,
152+
parallelismForVectorReads,
153+
sequentialReadFallback);
154+
}
155+
156+
public ReadOptions withBatchSizeForVectorReads(long batchSizeForVectorReads) {
157+
return new ReadOptions(
158+
minSeekForVectorReads,
159+
batchSizeForVectorReads,
160+
parallelismForVectorReads,
161+
sequentialReadFallback);
162+
}
163+
164+
public ReadOptions withParallelismForVectorReads(int parallelismForVectorReads) {
165+
return new ReadOptions(
166+
minSeekForVectorReads,
167+
batchSizeForVectorReads,
168+
parallelismForVectorReads,
169+
sequentialReadFallback);
170+
}
171+
172+
public ReadOptions withSequentialReadFallback(boolean sequentialReadFallback) {
173+
return new ReadOptions(
174+
minSeekForVectorReads,
175+
batchSizeForVectorReads,
176+
parallelismForVectorReads,
177+
sequentialReadFallback);
178+
}
179+
}
180+
85181
private static void fallbackToReadSequence(
86182
SeekableInputStream in, List<? extends FileRange> ranges) throws IOException {
87183
for (FileRange range : ranges) {
@@ -126,6 +222,13 @@ private static void copyToFileRanges(
126222
}
127223
}
128224

225+
private static void completeFileRangesExceptionally(
226+
CombinedRange combinedRange, Throwable throwable) {
227+
for (FileRange fileRange : combinedRange.underlying) {
228+
fileRange.getData().completeExceptionally(throwable);
229+
}
230+
}
231+
129232
private static void copyMultiBytesToBytes(
130233
List<byte[]> segments, int offset, byte[] bytes, int numBytes) {
131234
int remainSize = numBytes;

0 commit comments

Comments
 (0)