Skip to content

Commit 5a3cfa1

Browse files
committed
segment offset tests
Signed-off-by: Onur Satici <onur@spiraldb.com>
1 parent f8c9c4d commit 5a3cfa1

1 file changed

Lines changed: 175 additions & 0 deletions

File tree

vortex-file/src/tests.rs

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ use vortex_buffer::ByteBufferMut;
6262
use vortex_buffer::buffer;
6363
use vortex_error::VortexResult;
6464
use vortex_io::session::RuntimeSession;
65+
use vortex_layout::Layout;
6566
use vortex_layout::scan::scan_builder::ScanBuilder;
6667
use vortex_layout::session::LayoutSession;
6768
use vortex_session::VortexSession;
@@ -71,6 +72,7 @@ use crate::V1_FOOTER_FBS_SIZE;
7172
use crate::VERSION;
7273
use crate::VortexFile;
7374
use crate::WriteOptionsSessionExt;
75+
use crate::footer::SegmentSpec;
7476

7577
static SESSION: LazyLock<VortexSession> = LazyLock::new(|| {
7678
let mut session = VortexSession::empty()
@@ -1696,3 +1698,176 @@ async fn timestamp_unit_mismatch_errors_with_constant_children()
16961698

16971699
Ok(())
16981700
}
1701+
1702+
/// Collect all segment byte offsets reachable from a layout node.
1703+
fn collect_segment_offsets(layout: &dyn Layout, segment_specs: &[SegmentSpec]) -> Vec<u64> {
1704+
let mut result = Vec::new();
1705+
collect_segment_offsets_inner(layout, segment_specs, &mut result);
1706+
result
1707+
}
1708+
1709+
/// Recursive worker behind `collect_segment_offsets`.
///
/// Appends to `result` the offset of each segment owned by `layout`, then descends into every
/// child layout. Panics if a segment id is out of bounds for `segment_specs` or if the children
/// of a layout cannot be obtained.
fn collect_segment_offsets_inner(
    layout: &dyn Layout,
    segment_specs: &[SegmentSpec],
    result: &mut Vec<u64>,
) {
    // This node's own segments come first.
    result.extend(
        layout
            .segment_ids()
            .into_iter()
            .map(|seg_id| segment_specs[*seg_id as usize].offset),
    );

    // Then every subtree, in child order.
    let children = layout.children().unwrap();
    for child in children {
        collect_segment_offsets_inner(child.as_ref(), segment_specs, result);
    }
}
1721+
1722+
/// Panics unless every offset in `before` is strictly smaller than every offset in `after`.
///
/// Only the largest `before` offset and the smallest `after` offset are compared. If either
/// slice is empty there is nothing to compare and the function returns without asserting.
/// `context` is prepended to the panic message.
fn assert_offsets_ordered(before: &[u64], after: &[u64], context: &str) {
    let Some(max_before) = before.iter().copied().max() else {
        return;
    };
    let Some(min_after) = after.iter().copied().min() else {
        return;
    };

    assert!(
        max_before < min_after,
        "{context}: expected all 'before' offsets < all 'after' offsets, \
         but max before = {max_before} >= min after = {min_after}"
    );
}
1732+
1733+
#[tokio::test]
1734+
#[cfg_attr(miri, ignore)]
1735+
async fn test_segment_ordering_dict_codes_before_values() -> VortexResult<()> {
1736+
// Create low-cardinality strings to trigger dict encoding, plus an integer column.
1737+
let n = 100_000;
1738+
let values: Vec<&str> = (0..n).map(|i| ["alpha", "beta", "gamma"][i % 3]).collect();
1739+
let strings = VarBinArray::from(values).into_array();
1740+
let numbers = PrimitiveArray::from_iter(0..n as i32).into_array();
1741+
1742+
let st = StructArray::from_fields(&[("strings", strings), ("numbers", numbers)]).unwrap();
1743+
1744+
let mut buf = ByteBufferMut::empty();
1745+
let summary = SESSION
1746+
.write_options()
1747+
.write(&mut buf, st.to_array_stream())
1748+
.await?;
1749+
1750+
let footer = summary.footer();
1751+
let segment_specs = footer.segment_map();
1752+
let root = footer.layout();
1753+
1754+
// Walk the layout tree and find all dict layouts.
1755+
// Verify codes segments come before values segments in byte order within each run.
1756+
fn check_dict_ordering(layout: &dyn Layout, segment_specs: &[SegmentSpec]) {
1757+
if layout.encoding_id().as_ref() == "vortex.dict" {
1758+
// child 0 = values, child 1 = codes
1759+
let values_offsets =
1760+
collect_segment_offsets(layout.child(0).unwrap().as_ref(), segment_specs);
1761+
let codes_offsets =
1762+
collect_segment_offsets(layout.child(1).unwrap().as_ref(), segment_specs);
1763+
1764+
assert_offsets_ordered(
1765+
&codes_offsets,
1766+
&values_offsets,
1767+
"dict: codes should come before values",
1768+
);
1769+
}
1770+
1771+
for child in layout.children().unwrap() {
1772+
check_dict_ordering(child.as_ref(), segment_specs);
1773+
}
1774+
}
1775+
1776+
check_dict_ordering(root.as_ref(), segment_specs);
1777+
1778+
Ok(())
1779+
}
1780+
1781+
#[tokio::test]
1782+
#[cfg_attr(miri, ignore)]
1783+
async fn test_segment_ordering_zonemaps_after_data() -> VortexResult<()> {
1784+
// Create a multi-column struct with enough rows to produce zone maps.
1785+
let n = 100_000;
1786+
let values: Vec<&str> = (0..n).map(|i| ["alpha", "beta", "gamma"][i % 3]).collect();
1787+
let strings = VarBinArray::from(values).into_array();
1788+
let numbers = PrimitiveArray::from_iter(0..n as i32).into_array();
1789+
let floats = PrimitiveArray::from_iter((0..n).map(|i| i as f64 * 0.1)).into_array();
1790+
1791+
let st = StructArray::from_fields(&[
1792+
("strings", strings),
1793+
("numbers", numbers),
1794+
("floats", floats),
1795+
])
1796+
.unwrap();
1797+
1798+
let mut buf = ByteBufferMut::empty();
1799+
let summary = SESSION
1800+
.write_options()
1801+
.write(&mut buf, st.to_array_stream())
1802+
.await?;
1803+
1804+
let footer = summary.footer();
1805+
let segment_specs = footer.segment_map();
1806+
let root = footer.layout();
1807+
1808+
// Find all zoned layouts and verify data segments come before zone map segments.
1809+
fn check_zoned_ordering(layout: &dyn Layout, segment_specs: &[SegmentSpec]) {
1810+
if layout.encoding_id().as_ref() == "vortex.stats" {
1811+
// child 0 = data, child 1 = zones
1812+
let data_offsets =
1813+
collect_segment_offsets(layout.child(0).unwrap().as_ref(), segment_specs);
1814+
let zones_offsets =
1815+
collect_segment_offsets(layout.child(1).unwrap().as_ref(), segment_specs);
1816+
1817+
assert_offsets_ordered(
1818+
&data_offsets,
1819+
&zones_offsets,
1820+
"zoned: data should come before zones",
1821+
);
1822+
}
1823+
1824+
for child in layout.children().unwrap() {
1825+
check_zoned_ordering(child.as_ref(), segment_specs);
1826+
}
1827+
}
1828+
1829+
check_zoned_ordering(root.as_ref(), segment_specs);
1830+
1831+
// Additionally: all zone map segments across all columns should appear after
1832+
// all data segments across all columns.
1833+
let mut all_data_offsets = Vec::new();
1834+
let mut all_zones_offsets = Vec::new();
1835+
1836+
fn collect_all_zoned(
1837+
layout: &dyn Layout,
1838+
segment_specs: &[SegmentSpec],
1839+
all_data: &mut Vec<u64>,
1840+
all_zones: &mut Vec<u64>,
1841+
) {
1842+
if layout.encoding_id().as_ref() == "vortex.stats" {
1843+
// child 0 = data, child 1 = zones
1844+
all_data.extend(collect_segment_offsets(
1845+
layout.child(0).unwrap().as_ref(),
1846+
segment_specs,
1847+
));
1848+
all_zones.extend(collect_segment_offsets(
1849+
layout.child(1).unwrap().as_ref(),
1850+
segment_specs,
1851+
));
1852+
return;
1853+
}
1854+
for child in layout.children().unwrap() {
1855+
collect_all_zoned(child.as_ref(), segment_specs, all_data, all_zones);
1856+
}
1857+
}
1858+
1859+
collect_all_zoned(
1860+
root.as_ref(),
1861+
segment_specs,
1862+
&mut all_data_offsets,
1863+
&mut all_zones_offsets,
1864+
);
1865+
1866+
assert_offsets_ordered(
1867+
&all_data_offsets,
1868+
&all_zones_offsets,
1869+
"global: all data segments should come before all zone map segments",
1870+
);
1871+
1872+
Ok(())
1873+
}

0 commit comments

Comments
 (0)