@@ -62,6 +62,7 @@ use vortex_buffer::ByteBufferMut;
6262use vortex_buffer:: buffer;
6363use vortex_error:: VortexResult ;
6464use vortex_io:: session:: RuntimeSession ;
65+ use vortex_layout:: Layout ;
6566use vortex_layout:: scan:: scan_builder:: ScanBuilder ;
6667use vortex_layout:: session:: LayoutSession ;
6768use vortex_session:: VortexSession ;
@@ -71,6 +72,7 @@ use crate::V1_FOOTER_FBS_SIZE;
7172use crate :: VERSION ;
7273use crate :: VortexFile ;
7374use crate :: WriteOptionsSessionExt ;
75+ use crate :: footer:: SegmentSpec ;
7476
7577static SESSION : LazyLock < VortexSession > = LazyLock :: new ( || {
7678 let mut session = VortexSession :: empty ( )
@@ -1696,3 +1698,176 @@ async fn timestamp_unit_mismatch_errors_with_constant_children()
16961698
16971699 Ok ( ( ) )
16981700}
1701+
1702+ /// Collect all segment byte offsets reachable from a layout node.
1703+ fn collect_segment_offsets ( layout : & dyn Layout , segment_specs : & [ SegmentSpec ] ) -> Vec < u64 > {
1704+ let mut result = Vec :: new ( ) ;
1705+ collect_segment_offsets_inner ( layout, segment_specs, & mut result) ;
1706+ result
1707+ }
1708+
1709+ fn collect_segment_offsets_inner (
1710+ layout : & dyn Layout ,
1711+ segment_specs : & [ SegmentSpec ] ,
1712+ result : & mut Vec < u64 > ,
1713+ ) {
1714+ for seg_id in layout. segment_ids ( ) {
1715+ result. push ( segment_specs[ * seg_id as usize ] . offset ) ;
1716+ }
1717+ for child in layout. children ( ) . unwrap ( ) {
1718+ collect_segment_offsets_inner ( child. as_ref ( ) , segment_specs, result) ;
1719+ }
1720+ }
1721+
/// Asserts that every offset in `before` is strictly smaller than every offset
/// in `after`.
///
/// Only the largest `before` offset and the smallest `after` offset are
/// compared. If either slice is empty there is nothing to compare and the
/// function returns without asserting.
///
/// # Panics
///
/// Panics with a message prefixed by `context` when the largest `before`
/// offset is greater than or equal to the smallest `after` offset.
fn assert_offsets_ordered(before: &[u64], after: &[u64], context: &str) {
    let Some(max_before) = before.iter().copied().max() else {
        return;
    };
    let Some(min_after) = after.iter().copied().min() else {
        return;
    };

    assert!(
        max_before < min_after,
        "{context}: expected all 'before' offsets < all 'after' offsets, \
         but max before = {max_before} >= min after = {min_after}"
    );
}
1732+
1733+ #[ tokio:: test]
1734+ #[ cfg_attr( miri, ignore) ]
1735+ async fn test_segment_ordering_dict_codes_before_values ( ) -> VortexResult < ( ) > {
1736+ // Create low-cardinality strings to trigger dict encoding, plus an integer column.
1737+ let n = 100_000 ;
1738+ let values: Vec < & str > = ( 0 ..n) . map ( |i| [ "alpha" , "beta" , "gamma" ] [ i % 3 ] ) . collect ( ) ;
1739+ let strings = VarBinArray :: from ( values) . into_array ( ) ;
1740+ let numbers = PrimitiveArray :: from_iter ( 0 ..n as i32 ) . into_array ( ) ;
1741+
1742+ let st = StructArray :: from_fields ( & [ ( "strings" , strings) , ( "numbers" , numbers) ] ) . unwrap ( ) ;
1743+
1744+ let mut buf = ByteBufferMut :: empty ( ) ;
1745+ let summary = SESSION
1746+ . write_options ( )
1747+ . write ( & mut buf, st. to_array_stream ( ) )
1748+ . await ?;
1749+
1750+ let footer = summary. footer ( ) ;
1751+ let segment_specs = footer. segment_map ( ) ;
1752+ let root = footer. layout ( ) ;
1753+
1754+ // Walk the layout tree and find all dict layouts.
1755+ // Verify codes segments come before values segments in byte order within each run.
1756+ fn check_dict_ordering ( layout : & dyn Layout , segment_specs : & [ SegmentSpec ] ) {
1757+ if layout. encoding_id ( ) . as_ref ( ) == "vortex.dict" {
1758+ // child 0 = values, child 1 = codes
1759+ let values_offsets =
1760+ collect_segment_offsets ( layout. child ( 0 ) . unwrap ( ) . as_ref ( ) , segment_specs) ;
1761+ let codes_offsets =
1762+ collect_segment_offsets ( layout. child ( 1 ) . unwrap ( ) . as_ref ( ) , segment_specs) ;
1763+
1764+ assert_offsets_ordered (
1765+ & codes_offsets,
1766+ & values_offsets,
1767+ "dict: codes should come before values" ,
1768+ ) ;
1769+ }
1770+
1771+ for child in layout. children ( ) . unwrap ( ) {
1772+ check_dict_ordering ( child. as_ref ( ) , segment_specs) ;
1773+ }
1774+ }
1775+
1776+ check_dict_ordering ( root. as_ref ( ) , segment_specs) ;
1777+
1778+ Ok ( ( ) )
1779+ }
1780+
1781+ #[ tokio:: test]
1782+ #[ cfg_attr( miri, ignore) ]
1783+ async fn test_segment_ordering_zonemaps_after_data ( ) -> VortexResult < ( ) > {
1784+ // Create a multi-column struct with enough rows to produce zone maps.
1785+ let n = 100_000 ;
1786+ let values: Vec < & str > = ( 0 ..n) . map ( |i| [ "alpha" , "beta" , "gamma" ] [ i % 3 ] ) . collect ( ) ;
1787+ let strings = VarBinArray :: from ( values) . into_array ( ) ;
1788+ let numbers = PrimitiveArray :: from_iter ( 0 ..n as i32 ) . into_array ( ) ;
1789+ let floats = PrimitiveArray :: from_iter ( ( 0 ..n) . map ( |i| i as f64 * 0.1 ) ) . into_array ( ) ;
1790+
1791+ let st = StructArray :: from_fields ( & [
1792+ ( "strings" , strings) ,
1793+ ( "numbers" , numbers) ,
1794+ ( "floats" , floats) ,
1795+ ] )
1796+ . unwrap ( ) ;
1797+
1798+ let mut buf = ByteBufferMut :: empty ( ) ;
1799+ let summary = SESSION
1800+ . write_options ( )
1801+ . write ( & mut buf, st. to_array_stream ( ) )
1802+ . await ?;
1803+
1804+ let footer = summary. footer ( ) ;
1805+ let segment_specs = footer. segment_map ( ) ;
1806+ let root = footer. layout ( ) ;
1807+
1808+ // Find all zoned layouts and verify data segments come before zone map segments.
1809+ fn check_zoned_ordering ( layout : & dyn Layout , segment_specs : & [ SegmentSpec ] ) {
1810+ if layout. encoding_id ( ) . as_ref ( ) == "vortex.stats" {
1811+ // child 0 = data, child 1 = zones
1812+ let data_offsets =
1813+ collect_segment_offsets ( layout. child ( 0 ) . unwrap ( ) . as_ref ( ) , segment_specs) ;
1814+ let zones_offsets =
1815+ collect_segment_offsets ( layout. child ( 1 ) . unwrap ( ) . as_ref ( ) , segment_specs) ;
1816+
1817+ assert_offsets_ordered (
1818+ & data_offsets,
1819+ & zones_offsets,
1820+ "zoned: data should come before zones" ,
1821+ ) ;
1822+ }
1823+
1824+ for child in layout. children ( ) . unwrap ( ) {
1825+ check_zoned_ordering ( child. as_ref ( ) , segment_specs) ;
1826+ }
1827+ }
1828+
1829+ check_zoned_ordering ( root. as_ref ( ) , segment_specs) ;
1830+
1831+ // Additionally: all zone map segments across all columns should appear after
1832+ // all data segments across all columns.
1833+ let mut all_data_offsets = Vec :: new ( ) ;
1834+ let mut all_zones_offsets = Vec :: new ( ) ;
1835+
1836+ fn collect_all_zoned (
1837+ layout : & dyn Layout ,
1838+ segment_specs : & [ SegmentSpec ] ,
1839+ all_data : & mut Vec < u64 > ,
1840+ all_zones : & mut Vec < u64 > ,
1841+ ) {
1842+ if layout. encoding_id ( ) . as_ref ( ) == "vortex.stats" {
1843+ // child 0 = data, child 1 = zones
1844+ all_data. extend ( collect_segment_offsets (
1845+ layout. child ( 0 ) . unwrap ( ) . as_ref ( ) ,
1846+ segment_specs,
1847+ ) ) ;
1848+ all_zones. extend ( collect_segment_offsets (
1849+ layout. child ( 1 ) . unwrap ( ) . as_ref ( ) ,
1850+ segment_specs,
1851+ ) ) ;
1852+ return ;
1853+ }
1854+ for child in layout. children ( ) . unwrap ( ) {
1855+ collect_all_zoned ( child. as_ref ( ) , segment_specs, all_data, all_zones) ;
1856+ }
1857+ }
1858+
1859+ collect_all_zoned (
1860+ root. as_ref ( ) ,
1861+ segment_specs,
1862+ & mut all_data_offsets,
1863+ & mut all_zones_offsets,
1864+ ) ;
1865+
1866+ assert_offsets_ordered (
1867+ & all_data_offsets,
1868+ & all_zones_offsets,
1869+ "global: all data segments should come before all zone map segments" ,
1870+ ) ;
1871+
1872+ Ok ( ( ) )
1873+ }
0 commit comments