Skip to content

Commit 3c35f48

Browse files
westonpaceclaudewjones127
authored
feat: change default file format version to 2.1 (#6115)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: Will Jones <willjones127@gmail.com>
1 parent 507dd6f commit 3c35f48

14 files changed

Lines changed: 231 additions & 40 deletions

File tree

java/src/test/java/org/lance/DatasetTest.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,12 +123,12 @@ void testCreateEmptyDataset(@TempDir Path tempDir) {
123123
@Test
124124
void testGetLanceFileFormatVersion(@TempDir Path tempDir) {
125125
try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE)) {
126-
// Test default version (V2_0)
126+
// Test default version (V2_1)
127127
String defaultPath = tempDir.resolve("default_version").toString();
128128
TestUtils.SimpleTestDataset testDataset =
129129
new TestUtils.SimpleTestDataset(allocator, defaultPath);
130130
try (Dataset dataset = testDataset.createEmptyDataset()) {
131-
assertEquals(LanceConstants.FILE_FORMAT_VERSION_2_0, dataset.getLanceFileFormatVersion());
131+
assertEquals(LanceConstants.FILE_FORMAT_VERSION_2_1, dataset.getLanceFileFormatVersion());
132132
}
133133

134134
// Test LEGACY version
@@ -869,7 +869,7 @@ void testCalculateDataSize(@TempDir Path tempDir) {
869869
dataset = testDataset.createEmptyDataset();
870870

871871
try (Dataset dataset2 = testDataset.write(1, 5)) {
872-
assertEquals(100, dataset2.calculateDataSize());
872+
assertEquals(108, dataset2.calculateDataSize());
873873
}
874874
}
875875
}

java/src/test/java/org/lance/operation/MergeTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ void testMergeNewColumnWithNonContiguousFieldId(@TempDir Path tempDir) throws Ex
246246
datasetPath,
247247
addressRoot,
248248
new int[] {addressFieldId, cityFieldId, countryFieldId},
249-
new int[] {0, 1, 2});
249+
new int[] {-1, 0, 1});
250250

251251
FragmentMetadata fragmentMeta = initialDataset.getFragment(0).metadata();
252252
List<DataFile> dataFiles = fragmentMeta.getFiles();

java/src/test/java/org/lance/operation/OperationTestBase.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
public class OperationTestBase {
3636

3737
public static final int TEST_FILE_FORMAT_MAJOR_VERSION = 2;
38-
public static final int TEST_FILE_FORMAT_MINOR_VERSION = 0;
38+
public static final int TEST_FILE_FORMAT_MINOR_VERSION = 1;
3939
protected Dataset dataset;
4040

4141
@BeforeAll

python/python/tests/compat/test_scalar_indices.py

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
UpgradeDowngradeTest,
2020
compat_test,
2121
)
22+
from .util import safe_data_storage_version
2223

2324

2425
@compat_test(min_version="0.30.0")
@@ -40,7 +41,12 @@ def create(self):
4041
"btree": pa.array(range(1000)),
4142
}
4243
)
43-
dataset = lance.write_dataset(data, self.path, max_rows_per_file=100)
44+
dataset = lance.write_dataset(
45+
data,
46+
self.path,
47+
max_rows_per_file=100,
48+
data_storage_version=safe_data_storage_version(self.compat_version),
49+
)
4450
dataset.create_scalar_index("btree", "BTREE")
4551

4652
def check_read(self):
@@ -92,7 +98,12 @@ def create(self):
9298
"label_list": pa.array([[f"label{i}"] for i in range(1000)]),
9399
}
94100
)
95-
dataset = lance.write_dataset(data, self.path, max_rows_per_file=100)
101+
dataset = lance.write_dataset(
102+
data,
103+
self.path,
104+
max_rows_per_file=100,
105+
data_storage_version=safe_data_storage_version(self.compat_version),
106+
)
96107
dataset.create_scalar_index("bitmap", "BITMAP")
97108
dataset.create_scalar_index("label_list", "LABEL_LIST")
98109

@@ -141,7 +152,12 @@ def create(self):
141152
"ngram": pa.array([f"word{i}" for i in range(1000)]),
142153
}
143154
)
144-
dataset = lance.write_dataset(data, self.path, max_rows_per_file=100)
155+
dataset = lance.write_dataset(
156+
data,
157+
self.path,
158+
max_rows_per_file=100,
159+
data_storage_version=safe_data_storage_version(self.compat_version),
160+
)
145161
dataset.create_scalar_index("ngram", "NGRAM")
146162

147163
def check_read(self):
@@ -186,7 +202,12 @@ def create(self):
186202
"bloomfilter": pa.array(range(1000)),
187203
}
188204
)
189-
dataset = lance.write_dataset(data, self.path, max_rows_per_file=100)
205+
dataset = lance.write_dataset(
206+
data,
207+
self.path,
208+
max_rows_per_file=100,
209+
data_storage_version=safe_data_storage_version(self.compat_version),
210+
)
190211
dataset.create_scalar_index("zonemap", "ZONEMAP")
191212
dataset.create_scalar_index("bloomfilter", "BLOOMFILTER")
192213

@@ -237,7 +258,12 @@ def create(self):
237258
"json": pa.array([f'{{"val": {i}}}' for i in range(1000)], pa.json_()),
238259
}
239260
)
240-
dataset = lance.write_dataset(data, self.path, max_rows_per_file=100)
261+
dataset = lance.write_dataset(
262+
data,
263+
self.path,
264+
max_rows_per_file=100,
265+
data_storage_version=safe_data_storage_version(self.compat_version),
266+
)
241267
dataset.create_scalar_index(
242268
"json",
243269
IndexConfig(
@@ -288,7 +314,12 @@ def create(self):
288314
),
289315
}
290316
)
291-
dataset = lance.write_dataset(data, self.path, max_rows_per_file=100)
317+
dataset = lance.write_dataset(
318+
data,
319+
self.path,
320+
max_rows_per_file=100,
321+
data_storage_version=safe_data_storage_version(self.compat_version),
322+
)
292323
dataset.create_scalar_index("text", "INVERTED", with_position=True)
293324

294325
def check_read(self):

python/python/tests/compat/test_vector_indices.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
UpgradeDowngradeTest,
2121
compat_test,
2222
)
23+
from .util import safe_data_storage_version
2324

2425

2526
@compat_test(min_version="0.29.1.beta2")
@@ -44,7 +45,11 @@ def create(self):
4445
}
4546
)
4647

47-
dataset = lance.write_dataset(data, self.path)
48+
dataset = lance.write_dataset(
49+
data,
50+
self.path,
51+
data_storage_version=safe_data_storage_version(self.compat_version),
52+
)
4853
dataset.create_index(
4954
"vec",
5055
"IVF_PQ",
@@ -109,7 +114,11 @@ def create(self):
109114
}
110115
)
111116

112-
dataset = lance.write_dataset(data, self.path)
117+
dataset = lance.write_dataset(
118+
data,
119+
self.path,
120+
data_storage_version=safe_data_storage_version(self.compat_version),
121+
)
113122
dataset.create_index(
114123
"vec",
115124
"IVF_HNSW_PQ",
@@ -174,7 +183,11 @@ def create(self):
174183
}
175184
)
176185

177-
dataset = lance.write_dataset(data, self.path)
186+
dataset = lance.write_dataset(
187+
data,
188+
self.path,
189+
data_storage_version=safe_data_storage_version(self.compat_version),
190+
)
178191
dataset.create_index(
179192
"vec",
180193
"IVF_HNSW_SQ",
@@ -235,7 +248,11 @@ def create(self):
235248
}
236249
)
237250

238-
dataset = lance.write_dataset(data, self.path)
251+
dataset = lance.write_dataset(
252+
data,
253+
self.path,
254+
data_storage_version=safe_data_storage_version(self.compat_version),
255+
)
239256
dataset.create_index(
240257
"vec",
241258
"IVF_RQ",

python/python/tests/compat/util.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,21 @@
99
import pyarrow as pa
1010

1111

12+
def _version_component(component):
    """Parse one dotted version component, tolerating a glued-on suffix.

    Plain integers are parsed as-is. If that fails, only the leading ASCII
    digits are used, so a pre-release component such as ``"30rc1"`` yields 30.

    Raises:
        ValueError: If the component does not start with a digit.
    """
    try:
        return int(component)
    except ValueError:
        end = 0
        while end < len(component) and component[end] in "0123456789":
            end += 1
        if end == 0:
            # Nothing numeric to salvage; surface the original error.
            raise
        return int(component[:end])


def safe_data_storage_version(version_str):
    """Return a data_storage_version safe for the given lance version.

    Versions in the 0.30 series and older use "2.0"; anything newer
    (0.31+ or any major version above 0) uses "stable".

    Args:
        version_str: Dotted lance version string, e.g. ``"0.30.0"`` or
            ``"0.29.1.beta2"``. Only the first two components are
            inspected; a missing minor component is treated as 0.

    Returns:
        Either ``"2.0"`` or ``"stable"``.

    Raises:
        ValueError: If the major (or a needed minor) component does not
            start with a digit, e.g. an empty string.
    """
    parts = version_str.split(".")
    if _version_component(parts[0]) > 0:
        return "stable"
    # The minor component is only parsed for 0.x releases.
    minor = _version_component(parts[1]) if len(parts) > 1 else 0
    return "2.0" if minor <= 30 else "stable"
25+
26+
1227
def build_basic_types():
1328
schema = pa.schema(
1429
[

python/python/tests/test_file.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -243,9 +243,10 @@ def test_metadata(tmp_path):
243243
assert len(column.pages) == 1
244244

245245
page = column.pages[0]
246-
assert len(page.buffers) == 1
247-
assert page.buffers[0].position == 0
248-
assert page.buffers[0].size == 24
246+
assert len(page.buffers) > 0
247+
for buffer in page.buffers:
248+
assert buffer.position % 64 == 0
249+
assert buffer.size > 0
249250

250251
assert len(page.encoding) > 0
251252

python/python/tests/test_optimize.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -74,28 +74,30 @@ def test_optimize_max_bytes(tmp_path: Path):
7474
arr = pa.FixedSizeListArray.from_arrays(arr, 1024)
7575
data = pa.table({"a": arr})
7676

77+
# Write out 4K rows and 32MB of data
7778
dataset = lance.write_dataset(
7879
data, base_dir, max_rows_per_file=2 * 1024, data_storage_version="stable"
7980
)
81+
# We get 2 fragments
8082
assert len(dataset.get_fragments()) == 2
8183

82-
# max_bytes_per_file is too small and we get tiny files
84+
# Now run compaction with a small max_bytes_per_file (1000 bytes) to get more
85+
# fragments. The exact count is hard to predict: a new fragment is only started
86+
# after data has actually been written, and that depends on how much the file
87+
# format chooses to accumulate. It should always be more than 2, though.
8388
metrics = dataset.optimize.compact_files(
8489
target_rows_per_fragment=100 * 1024,
8590
materialize_deletions=False,
8691
max_bytes_per_file=1000,
8792
batch_size=128,
8893
)
89-
90-
# We get 4 fragments here because we don't actually write any data to the file
91-
# until we've accumulated 8MiB for a page.
9294
assert metrics.fragments_removed == 2
93-
assert metrics.fragments_added == 4
95+
assert metrics.fragments_added > 2
9496
assert metrics.files_removed == 2
95-
assert metrics.files_added == 4
97+
assert metrics.files_added > 2
9698

9799
num_frags = len(dataset.get_fragments())
98-
assert num_frags == 4
100+
assert num_frags == metrics.fragments_added
99101

100102
dataset = lance.write_dataset(
101103
data,
@@ -115,9 +117,9 @@ def test_optimize_max_bytes(tmp_path: Path):
115117
results = [task.execute(dataset) for task in plan.tasks]
116118
metrics = Compaction.commit(dataset, results)
117119
assert metrics.fragments_removed == 2
118-
assert metrics.fragments_added == 4
120+
assert metrics.fragments_added > 2
119121
assert metrics.files_removed == 2
120-
assert metrics.files_added == 4
122+
assert metrics.files_added > 2
121123

122124
dataset = lance.write_dataset(
123125
data,

0 commit comments

Comments
 (0)