Skip to content

Commit 5c98e47

Browse files
authored
feat: introduce IndexFileVersion for Python API (#4040)
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
1 parent 5fd8412 commit 5c98e47

7 files changed

Lines changed: 59 additions & 8 deletions

File tree

python/python/lance/dataset.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1947,6 +1947,8 @@ def create_index(
19471947
- num_bits
19481948
The number of bits for PQ (Product Quantization). Default is 8.
19491949
Only 4, 8 are supported.
1950+
- index_file_version
1951+
The version of the index file. Default is "V3".
19501952
19511953
Optional parameters for `IVF_HNSW_*`:
19521954
max_level

python/python/lance/indices.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-FileCopyrightText: Copyright The Lance Authors
33
import math
44
import warnings
5+
from enum import Enum
56
from typing import TYPE_CHECKING, Optional, Union
67

78
import numpy as np
@@ -16,6 +17,11 @@
1617
from .dependencies import torch
1718

1819

20+
class IndexFileVersion(str, Enum):
21+
LEGACY = "Legacy"
22+
V3 = "V3"
23+
24+
1925
class PqModel:
2026
"""
2127
A class that represents a trained PQ model

python/python/tests/test_vector_index.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import pytest
1515
from lance import LanceFragment
1616
from lance.dataset import VectorIndexReader
17+
from lance.indices import IndexFileVersion
1718
from lance.util import validate_vector_index # noqa: E402
1819
from lance.vector import vec_to_table # noqa: E402
1920

@@ -238,7 +239,10 @@ def test_f16_cuda(tmp_path):
238239
validate_vector_index(dataset, "vector")
239240

240241

241-
def test_index_with_nans(tmp_path):
242+
@pytest.mark.parametrize(
243+
"index_file_version", [IndexFileVersion.V3, IndexFileVersion.LEGACY]
244+
)
245+
def test_index_with_nans(tmp_path, index_file_version):
242246
# 1024 rows, the entire table should be sampled
243247
tbl = create_table(nvec=1000, nans=24)
244248

@@ -248,11 +252,17 @@ def test_index_with_nans(tmp_path):
248252
index_type="IVF_PQ",
249253
num_partitions=4,
250254
num_sub_vectors=16,
255+
index_file_version=index_file_version,
251256
)
257+
idx_stats = dataset.stats.index_stats("vector_idx")
258+
assert idx_stats["indices"][0]["index_file_version"] == index_file_version
252259
validate_vector_index(dataset, "vector")
253260

254261

255-
def test_torch_index_with_nans(tmp_path):
262+
@pytest.mark.parametrize(
263+
"index_file_version", [IndexFileVersion.V3, IndexFileVersion.LEGACY]
264+
)
265+
def test_torch_index_with_nans(tmp_path, index_file_version):
256266
torch = pytest.importorskip("torch")
257267

258268
# 1024 rows, the entire table should be sampled
@@ -266,7 +276,10 @@ def test_torch_index_with_nans(tmp_path):
266276
num_sub_vectors=16,
267277
accelerator=torch.device("cpu"),
268278
one_pass_ivfpq=True,
279+
index_file_version=index_file_version,
269280
)
281+
idx_stats = dataset.stats.index_stats("vector_idx")
282+
assert idx_stats["indices"][0]["index_file_version"] == index_file_version
270283
validate_vector_index(dataset, "vector")
271284

272285

@@ -658,6 +671,7 @@ def test_pre_populated_ivf_centroids(dataset, tmp_path: Path):
658671
"num_sub_vectors": 8,
659672
"transposed": True,
660673
},
674+
"index_file_version": IndexFileVersion.V3,
661675
}
662676

663677
with pytest.raises(KeyError, match='Index "non-existent_idx" not found'):

python/src/dataset.rs

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ use pyo3::{
3131
use pyo3::{prelude::*, IntoPyObjectExt};
3232
use snafu::location;
3333

34-
use lance::dataset::builder::DatasetBuilder;
3534
use lance::dataset::refs::{Ref, TagContents};
3635
use lance::dataset::scanner::{
3736
DatasetRecordBatchStream, ExecutionStatsCallback, MaterializationStyle,
@@ -54,6 +53,7 @@ use lance::dataset::{
5453
use lance::dataset::{ColumnAlteration, ProjectionRequest};
5554
use lance::index::vector::utils::get_vector_type;
5655
use lance::index::{vector::VectorIndexParams, DatasetIndexInternalExt};
56+
use lance::{dataset::builder::DatasetBuilder, index::vector::IndexFileVersion};
5757
use lance_arrow::as_fixed_size_list_array;
5858
use lance_index::scalar::inverted::query::{
5959
BooleanQuery, BoostQuery, FtsQuery, MatchQuery, MultiMatchQuery, Operator, PhraseQuery,
@@ -2027,6 +2027,7 @@ fn prepare_vector_index_params(
20272027
let mut hnsw_params = HnswBuildParams::default();
20282028
let mut pq_params = PQBuildParams::default();
20292029
let mut sq_params = SQBuildParams::default();
2030+
let mut index_file_version = IndexFileVersion::V3;
20302031

20312032
if let Some(kwargs) = kwargs {
20322033
// Parse metric type
@@ -2143,9 +2144,15 @@ fn prepare_vector_index_params(
21432144
let codebook = as_fixed_size_list_array(batch.column(0));
21442145
pq_params.codebook = Some(codebook.values().clone())
21452146
};
2147+
2148+
if let Some(version) = kwargs.get_item("index_file_version")? {
2149+
let version: String = version.extract()?;
2150+
index_file_version = IndexFileVersion::try_from(&version)
2151+
.map_err(|e| PyValueError::new_err(format!("Invalid index_file_version: {e}")))?;
2152+
}
21462153
}
21472154

2148-
match index_type {
2155+
let mut params = match index_type {
21492156
"IVF_FLAT" => Ok(Box::new(VectorIndexParams::ivf_flat(
21502157
ivf_params.num_partitions,
21512158
m_type,
@@ -2178,7 +2185,9 @@ fn prepare_vector_index_params(
21782185
_ => Err(PyValueError::new_err(format!(
21792186
"Index type '{index_type}' is not supported."
21802187
))),
2181-
}
2188+
}?;
2189+
params.version(index_file_version);
2190+
Ok(params)
21822191
}
21832192

21842193
#[pyclass(name = "_FragmentWriteProgress", module = "_lib")]

rust/lance/src/index/vector.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ use lance_io::traits::Reader;
4141
use lance_linalg::distance::*;
4242
use lance_table::format::Index as IndexMetadata;
4343
use object_store::path::Path;
44+
use serde::Serialize;
4445
use snafu::location;
4546
use tempfile::tempdir;
4647
use tracing::instrument;
@@ -64,12 +65,25 @@ pub enum StageParams {
6465
// The version of the index file.
6566
// `Legacy` is used only for IVF_PQ indexes, and is the default value.
6667
// All other index types use `V3`.
67-
#[derive(Debug, Clone)]
68+
#[derive(Debug, Clone, Serialize)]
6869
pub enum IndexFileVersion {
6970
Legacy,
7071
V3,
7172
}
7273

74+
impl IndexFileVersion {
75+
pub fn try_from(version: &str) -> Result<Self> {
76+
match version.to_lowercase().as_str() {
77+
"legacy" => Ok(Self::Legacy),
78+
"v3" => Ok(Self::V3),
79+
_ => Err(Error::Index {
80+
message: format!("Invalid index file version: {}", version),
81+
location: location!(),
82+
}),
83+
}
84+
}
85+
}
86+
7387
/// The parameters to build vector index.
7488
#[derive(Debug, Clone)]
7589
pub struct VectorIndexParams {

rust/lance/src/index/vector/ivf.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ use super::{
1414
pq::{build_pq_model, PQIndex},
1515
utils::maybe_sample_training_data,
1616
};
17-
use crate::dataset::builder::DatasetBuilder;
1817
use crate::index::vector::utils::{get_vector_dim, get_vector_type};
1918
use crate::index::DatasetIndexInternalExt;
19+
use crate::{dataset::builder::DatasetBuilder, index::vector::IndexFileVersion};
2020
use crate::{
2121
dataset::Dataset,
2222
index::{pb, prefilter::PreFilter, vector::ivf::io::write_pq_partitions, INDEX_FILE_NAME},
@@ -775,6 +775,7 @@ pub struct IvfIndexStatistics {
775775
partitions: Vec<IvfIndexPartitionStatistics>,
776776
centroids: Vec<Vec<f32>>,
777777
loss: Option<f64>,
778+
index_file_version: IndexFileVersion,
778779
}
779780

780781
fn centroids_to_vectors(centroids: &FixedSizeListArray) -> Result<Vec<Vec<f32>>> {
@@ -880,6 +881,7 @@ impl Index for IVFIndex {
880881
partitions: partitions_statistics,
881882
centroids: centroid_vecs,
882883
loss: self.ivf.loss(),
884+
index_file_version: IndexFileVersion::Legacy,
883885
})?)
884886
}
885887

rust/lance/src/index/vector/ivf/v2.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@ use std::{
1010
sync::{Arc, Weak},
1111
};
1212

13-
use crate::index::vector::builder::{index_type_string, IvfIndexBuilder};
13+
use crate::index::vector::{
14+
builder::{index_type_string, IvfIndexBuilder},
15+
IndexFileVersion,
16+
};
1417
use crate::{
1518
index::{
1619
vector::{utils::PartitionLoadLock, VectorIndex},
@@ -412,6 +415,7 @@ impl<S: IvfSubIndex + 'static, Q: Quantization + 'static> Index for IVFIndex<S,
412415
partitions: partitions_statistics,
413416
centroids: centroid_vecs,
414417
loss: self.ivf.loss(),
418+
index_file_version: IndexFileVersion::V3,
415419
})?)
416420
}
417421

0 commit comments

Comments
 (0)