Skip to content

Commit c38f538

Browse files
authored
feat: ignore indices created in newer format (#3867)
1 parent 22dafb4 commit c38f538

12 files changed

Lines changed: 169 additions & 35 deletions

File tree

protos/table.proto

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,9 @@ message IndexMetadata {
194194
/// Indices should avoid putting large amounts of information in this field, as it will
195195
/// bloat the manifest.
196196
google.protobuf.Any index_details = 6;
197+
198+
/// The minimum lance version that this index is compatible with.
199+
optional int32 index_version = 7;
197200
}
198201

199202
// Index Section, containing a list of index metadata for one dataset version.

python/python/lance/dataset.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3099,6 +3099,7 @@ class CreateIndex(BaseOperation):
30993099
fields: List[int]
31003100
dataset_version: int
31013101
fragment_ids: Set[int]
3102+
index_version: int
31023103

31033104
@dataclass
31043105
class DataReplacementGroup:

python/python/tests/test_commit_index.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ def test_commit_index(dataset_with_index, test_table, tmp_path):
6262
[field_idx],
6363
dataset_without_index.version,
6464
set([f.fragment_id for f in dataset_without_index.get_fragments()]),
65+
0,
6566
)
6667
dataset_without_index = lance.LanceDataset.commit(
6768
dataset_without_index.uri,

python/src/transaction.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ impl FromPyObject<'_> for PyLance<Operation> {
128128
let name = ob.getattr("name")?.extract()?;
129129
let fields = ob.getattr("fields")?.extract()?;
130130
let dataset_version = ob.getattr("dataset_version")?.extract()?;
131-
131+
let index_version = ob.getattr("index_version")?.extract()?;
132132
let fragment_ids = ob.getattr("fragment_ids")?;
133133
let fragment_ids_ref: &Bound<'_, PySet> = fragment_ids.downcast()?;
134134
let fragment_ids = fragment_ids_ref
@@ -147,6 +147,7 @@ impl FromPyObject<'_> for PyLance<Operation> {
147147
// TODO: we should use lance::dataset::Dataset::commit_existing_index once
148148
// we have a way to determine index details from an existing index.
149149
index_details: None,
150+
index_version,
150151
}];
151152

152153
let op = Operation::CreateIndex {

rust/lance-index/src/lib.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,31 @@ impl IndexType {
168168
| Self::IvfSq
169169
)
170170
}
171+
172+
/// Returns the current format version of the index type,
173+
/// bump this when the index format changes.
174+
/// Indices which higher version than these will be ignored for compatibility,
175+
/// This would happen when creating index in a newer version of Lance,
176+
/// but then opening the index in older version of Lance
177+
pub fn version(&self) -> i32 {
178+
match self {
179+
Self::Scalar => 0,
180+
Self::BTree => 0,
181+
Self::Bitmap => 0,
182+
Self::LabelList => 0,
183+
Self::Inverted => 0,
184+
Self::NGram => 0,
185+
186+
// for now all vector indices are built by the same builder,
187+
// so they share the same version.
188+
Self::Vector
189+
| Self::IvfFlat
190+
| Self::IvfSq
191+
| Self::IvfPq
192+
| Self::IvfHnswSq
193+
| Self::IvfHnswPq => 0,
194+
}
195+
}
171196
}
172197

173198
pub trait IndexParams: Send + Sync {

rust/lance-index/src/scalar.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,18 @@ impl TryFrom<IndexType> for ScalarIndexType {
6767
}
6868
}
6969

70+
impl From<ScalarIndexType> for IndexType {
71+
fn from(val: ScalarIndexType) -> Self {
72+
match val {
73+
ScalarIndexType::BTree => Self::BTree,
74+
ScalarIndexType::Bitmap => Self::Bitmap,
75+
ScalarIndexType::LabelList => Self::LabelList,
76+
ScalarIndexType::NGram => Self::NGram,
77+
ScalarIndexType::Inverted => Self::Inverted,
78+
}
79+
}
80+
}
81+
7082
#[derive(Default)]
7183
pub struct ScalarIndexParams {
7284
/// If set then always use the given index type and skip auto-detection

rust/lance-table/src/format/index.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ use uuid::Uuid;
1010

1111
use super::pb;
1212
use lance_core::{Error, Result};
13-
1413
/// Index metadata
1514
#[derive(Debug, Clone, PartialEq)]
1615
pub struct Index {
@@ -36,6 +35,9 @@ pub struct Index {
3635
/// This is an Option because older versions of Lance may not have this defined. However, it should always
3736
/// be present in newer versions.
3837
pub index_details: Option<prost_types::Any>,
38+
39+
/// The index version.
40+
pub index_version: i32,
3941
}
4042

4143
impl DeepSizeOf for Index {
@@ -76,6 +78,7 @@ impl TryFrom<pb::IndexMetadata> for Index {
7678
dataset_version: proto.dataset_version,
7779
fragment_bitmap,
7880
index_details: proto.index_details,
81+
index_version: proto.index_version.unwrap_or_default(),
7982
})
8083
}
8184
}
@@ -99,6 +102,7 @@ impl From<&Index> for pb::IndexMetadata {
99102
dataset_version: idx.dataset_version,
100103
fragment_bitmap,
101104
index_details: idx.index_details.clone(),
105+
index_version: Some(idx.index_version),
102106
}
103107
}
104108
}

rust/lance/src/dataset/transaction.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2510,6 +2510,7 @@ mod tests {
25102510
dataset_version: 1,
25112511
fragment_bitmap: None,
25122512
index_details: None,
2513+
index_version: 0,
25132514
};
25142515
let fragment0 = Fragment::new(0);
25152516
let fragment1 = Fragment::new(1);

rust/lance/src/index.rs

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,8 @@ use lance_table::format::{Fragment, SelfDescribingFileReader};
5656
use lance_table::io::manifest::read_manifest_indexes;
5757
use roaring::RoaringBitmap;
5858
use scalar::{
59-
build_inverted_index, detect_scalar_index_type, index_matches_criteria, inverted_index_details,
60-
TrainingRequest,
59+
build_inverted_index, detect_scalar_index_type, index_matches_criteria, infer_index_type,
60+
inverted_index_details, TrainingRequest,
6161
};
6262
use serde_json::json;
6363
use snafu::location;
@@ -281,7 +281,7 @@ impl DatasetIndexExt for Dataset {
281281
}
282282

283283
let index_id = Uuid::new_v4();
284-
let index_details: prost_types::Any = match (index_type, params.index_name()) {
284+
let index_details = match (index_type, params.index_name()) {
285285
(
286286
IndexType::Bitmap
287287
| IndexType::BTree
@@ -388,6 +388,7 @@ impl DatasetIndexExt for Dataset {
388388
dataset_version: self.manifest.version,
389389
fragment_bitmap: Some(self.get_fragments().iter().map(|f| f.id() as u32).collect()),
390390
index_details: Some(index_details),
391+
index_version: index_type.version(),
391392
};
392393
let transaction = Transaction::new(
393394
self.manifest.version,
@@ -456,10 +457,27 @@ impl DatasetIndexExt for Dataset {
456457
return Ok(indices);
457458
}
458459

459-
let loaded_indices: Arc<Vec<IndexMetadata>> =
460+
let loaded_indices: Vec<IndexMetadata> =
460461
read_manifest_indexes(&self.object_store, &self.manifest_location, &self.manifest)
461462
.await?
462-
.into();
463+
.into_iter()
464+
.filter(|idx| {
465+
let max_valid_version = infer_index_type(idx)
466+
.map(|t| t.version())
467+
.unwrap_or_default();
468+
let is_valid = idx.index_version <= max_valid_version;
469+
if !is_valid {
470+
log::warn!(
471+
"Index {} has version {}, which is not supported (<={}), ignoring it",
472+
idx.name,
473+
idx.index_version,
474+
max_valid_version,
475+
);
476+
}
477+
is_valid
478+
})
479+
.collect();
480+
let loaded_indices = Arc::new(loaded_indices);
463481

464482
self.session.index_cache.insert_metadata(
465483
self.base.as_ref(),
@@ -492,6 +510,7 @@ impl DatasetIndexExt for Dataset {
492510
dataset_version: self.manifest.version,
493511
fragment_bitmap: Some(self.get_fragments().iter().map(|f| f.id() as u32).collect()),
494512
index_details: None,
513+
index_version: 0,
495514
};
496515

497516
let transaction = Transaction::new(
@@ -574,28 +593,25 @@ impl DatasetIndexExt for Dataset {
574593
let mut new_indices = vec![];
575594
let mut removed_indices = vec![];
576595
for deltas in name_to_indices.values() {
577-
let Some((new_id, removed, mut new_frag_ids)) =
578-
merge_indices(dataset.clone(), deltas.as_slice(), options).await?
596+
let Some(res) = merge_indices(dataset.clone(), deltas.as_slice(), options).await?
579597
else {
580598
continue;
581599
};
582-
for removed_idx in removed.iter() {
583-
new_frag_ids |= removed_idx.fragment_bitmap.as_ref().unwrap();
584-
}
585600

586601
let last_idx = deltas.last().expect("Delta indices should not be empty");
587602
let new_idx = IndexMetadata {
588-
uuid: new_id,
603+
uuid: res.new_uuid,
589604
name: last_idx.name.clone(), // Keep the same name
590605
fields: last_idx.fields.clone(),
591606
dataset_version: self.manifest.version,
592-
fragment_bitmap: Some(new_frag_ids),
607+
fragment_bitmap: Some(res.new_fragment_bitmap),
593608
index_details: last_idx.index_details.clone(),
609+
index_version: res.new_index_version,
594610
};
595-
removed_indices.extend(removed.iter().map(|&idx| idx.clone()));
596-
if deltas.len() > removed.len() {
611+
removed_indices.extend(res.removed_indices.iter().map(|&idx| idx.clone()));
612+
if deltas.len() > removed_indices.len() {
597613
new_indices.extend(
598-
deltas[0..(deltas.len() - removed.len())]
614+
deltas[0..(deltas.len() - res.removed_indices.len())]
599615
.iter()
600616
.map(|&idx| idx.clone()),
601617
);
@@ -1104,7 +1120,8 @@ impl DatasetIndexInternalExt for Dataset {
11041120
let is_vector_index = idx_schema
11051121
.fields
11061122
.iter()
1107-
.any(|f| matches!(f.data_type(), DataType::FixedSizeList(_, _)));
1123+
.any(|f| is_vector_field(f.data_type()));
1124+
11081125
idx.fields.len() == 1 && !is_vector_index
11091126
}) {
11101127
let field = index.fields[0];
@@ -1208,6 +1225,17 @@ impl DatasetIndexInternalExt for Dataset {
12081225
}
12091226
}
12101227

1228+
fn is_vector_field(data_type: DataType) -> bool {
1229+
match data_type {
1230+
DataType::FixedSizeList(_, _) => true,
1231+
DataType::List(inner) => {
1232+
// If the inner type is a fixed size list, then it is a multivector field
1233+
matches!(inner.data_type(), DataType::FixedSizeList(_, _))
1234+
}
1235+
_ => false,
1236+
}
1237+
}
1238+
12111239
#[cfg(test)]
12121240
mod tests {
12131241
use crate::dataset::builder::DatasetBuilder;

rust/lance/src/index/append.rs

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,13 @@ use crate::dataset::index::LanceIndexStoreExt;
1919
use crate::dataset::scanner::ColumnOrdering;
2020
use crate::dataset::Dataset;
2121

22+
pub struct IndexMergeResults<'a> {
23+
pub new_uuid: Uuid,
24+
pub removed_indices: Vec<&'a IndexMetadata>,
25+
pub new_fragment_bitmap: RoaringBitmap,
26+
pub new_index_version: i32,
27+
}
28+
2229
/// Merge in-inflight unindexed data, with a specific number of previous indices
2330
/// into a new index, to improve the query performance.
2431
///
@@ -33,7 +40,7 @@ pub async fn merge_indices<'a>(
3340
dataset: Arc<Dataset>,
3441
old_indices: &[&'a IndexMetadata],
3542
options: &OptimizeOptions,
36-
) -> Result<Option<(Uuid, Vec<&'a IndexMetadata>, RoaringBitmap)>> {
43+
) -> Result<Option<IndexMergeResults<'a>>> {
3744
if old_indices.is_empty() {
3845
return Err(Error::Index {
3946
message: "Append index: no previous index found".to_string(),
@@ -76,7 +83,8 @@ pub async fn merge_indices<'a>(
7683
frag_bitmap.insert(frag.id as u32);
7784
});
7885

79-
let (new_uuid, indices_merged) = match indices[0].index_type() {
86+
let index_type = indices[0].index_type();
87+
let (new_uuid, indices_merged) = match index_type {
8088
it if it.is_scalar() => {
8189
// There are no delta indices for scalar, so adding all indexed
8290
// fragments to the new index.
@@ -171,11 +179,17 @@ pub async fn merge_indices<'a>(
171179
}),
172180
}?;
173181

174-
Ok(Some((
182+
let removed_indices = old_indices[old_indices.len() - indices_merged..].to_vec();
183+
for removed in removed_indices.iter() {
184+
frag_bitmap |= removed.fragment_bitmap.as_ref().unwrap();
185+
}
186+
187+
Ok(Some(IndexMergeResults {
175188
new_uuid,
176-
old_indices[old_indices.len() - indices_merged..].to_vec(),
177-
frag_bitmap,
178-
)))
189+
removed_indices,
190+
new_fragment_bitmap: frag_bitmap,
191+
new_index_version: index_type.version(),
192+
}))
179193
}
180194

181195
#[cfg(test)]

0 commit comments

Comments
 (0)