@@ -7,6 +7,7 @@ use vortex_buffer::Buffer;
77use vortex_buffer:: BufferMut ;
88use vortex_error:: VortexResult ;
99use vortex_error:: vortex_ensure;
10+ use vortex_error:: vortex_err;
1011
1112use crate :: ArrayRef ;
1213use crate :: Canonical ;
@@ -27,26 +28,91 @@ use crate::stats::ArrayStats;
2728use crate :: validity:: Validity ;
2829
2930/// An array that partially "patches" another array with new values.
31+ ///
32+ /// # Background
33+ ///
34+ /// This is meant to be the foundation of a fully data-parallel patching strategy, based on the
35+ /// work published in ["G-ALP" from Hepkema et al.](https://ir.cwi.nl/pub/35205/35205.pdf)
36+ ///
37+ /// Patching is common when an encoding almost completely covers an array save a few exceptions.
38+ /// In that case, rather than avoid the encoding entirely, it's preferable to
39+ ///
40+ /// * Replace unencodable values with fillers (zeros, frequent values, nulls, etc.)
41+ /// * Wrap the array with a `PatchedArray` signaling that when the original array is executed,
42+ /// some of the decoded values must be overwritten.
43+ ///
44+ /// In Vortex, the FastLanes bit-packing encoding is often the terminal node in an encoding tree,
45+ /// and FastLanes has an intrinsic chunking of 1024 elements. Thus, 1024 elements is pervasively
46+ /// a useful unit of chunking throughout Vortex, and so we use 1024 as a chunk size here
47+ /// as well.
48+ ///
49+ /// # Details
50+ ///
51+ /// To patch an array, we first divide it into a set of chunks of length 1024, and then within
52+ /// each chunk, we assign each position to a lane. The number of lanes depends on the width of
53+ /// the underlying type.
54+ ///
55+ /// Thus, rather than sorting patch indices and values by their global offset, they are sorted
56+ /// primarily by their chunk, and then subsequently by their lanes.
57+ ///
58+ /// The Patched array layout has 4 children
59+ ///
60+ /// * `inner`: the inner array is the one containing encoded values, including the filler values
61+ /// that need to be patched over at execution time
62+ /// * `lane_offsets`: this is an indexing buffer that allows you to see into ranges of the other
63+ /// two children
64+ /// * `indices`: An array of `u16` chunk indices, indicating where within the chunk should the value
65+ /// be overwritten by the patch value
66+ /// * `values`: The child array containing the patch values, which should be inserted over
67+ /// the values of the `inner` at the locations provided by `indices`
68+ ///
69+ /// `indices` and `values` are aligned and accessed together.
70+ ///
71+ /// ```text
72+ ///
73+ /// chunk 0 chunk 0 chunk 0 chunk 0 chunk 0 chunk 0
74+ /// lane 0 lane 1 lane 2 lane 3 lane 4 lane 5
75+ /// ┌────────────┬────────────┬────────────┬────────────┬────────────┬────────────┐
76+ /// lane_offsets │ 0 │ 0 │ 2 │ 2 │ 3 │ 5 │ ...
77+ /// └─────┬──────┴─────┬──────┴─────┬──────┴──────┬─────┴──────┬─────┴──────┬─────┘
78+ /// │ │ │ │ │ │
79+ /// │ │ │ │ │ │
80+ /// ┌─────┴────────────┘ └──────┬──────┘ ┌──────┘ └─────┐
81+ /// │ │ │ │
82+ /// │ │ │ │
83+ /// │ │ │ │
84+ /// ▼────────────┬────────────┬────────────▼────────────▼────────────┬────────────▼
85+ /// indices │ │ │ │ │ │ │
86+ /// │ │ │ │ │ │ │
87+ /// ├────────────┼────────────┼────────────┼────────────┼────────────┼────────────┤
88+ /// values │ │ │ │ │ │ │
89+ /// │ │ │ │ │ │ │
90+ /// └────────────┴────────────┴────────────┴────────────┴────────────┴────────────┘
91+ /// ```
92+ ///
93+ /// It turns out that this layout is optimal for executing patching on GPUs, because the
94+ /// `lane_offsets` allows each thread in a warp to seek to its patches in constant time.
3095#[ derive( Debug , Clone ) ]
3196pub struct PatchedArray {
3297 /// The inner array that is being patched. This is the zeroth child.
3398 pub ( super ) inner : ArrayRef ,
3499
35- /// Number of 1024-element chunks. Pre-computed for convenience.
36- pub ( super ) n_chunks : usize ,
37-
38100 /// Number of lanes the patch indices and values have been split into. Each of the `n_chunks`
39101 /// of 1024 values is split into `n_lanes` lanes horizontally, each lane having 1024 / n_lanes
40102 /// values that might be patched.
41103 pub ( super ) n_lanes : usize ,
42104
43- /// Offset into the first chunk
105+ /// The offset into that first chunk that is considered in bounds.
106+ ///
107+ /// The patch indices of the first chunk less than `offset` should be skipped, and the offset
108+ /// should be subtracted out of the remaining offsets to get their final position in the
109+ /// executed array.
44110 pub ( super ) offset : usize ,
45111 /// Total length.
46112 pub ( super ) len : usize ,
47113
48114 /// lane offsets. The PType of these MUST be u32
49- pub ( super ) lane_offsets : BufferHandle ,
115+ pub ( super ) lane_offsets : ArrayRef ,
50116 /// indices within a 1024-element chunk. The PType of these MUST be u16
51117 pub ( super ) indices : ArrayRef ,
52118 /// patch values corresponding to the indices. The ptype is specified by `values_ptype`.
@@ -84,13 +150,19 @@ impl PatchedArray {
84150 let values_ptype = patches. dtype ( ) . as_ptype ( ) ;
85151
86152 let TransposedPatches {
87- n_chunks,
88153 n_lanes,
89154 lane_offsets,
90155 indices,
91156 values,
157+ ..
92158 } = transpose_patches ( patches, ctx) ?;
93159
160+ let lane_offsets = PrimitiveArray :: from_buffer_handle (
161+ BufferHandle :: new_host ( lane_offsets) ,
162+ PType :: U32 ,
163+ Validity :: NonNullable ,
164+ )
165+ . into_array ( ) ;
94166 let indices = PrimitiveArray :: from_buffer_handle (
95167 BufferHandle :: new_host ( indices) ,
96168 PType :: U16 ,
@@ -108,11 +180,10 @@ impl PatchedArray {
108180
109181 Ok ( Self {
110182 inner,
111- n_chunks,
112183 n_lanes,
113184 offset : 0 ,
114185 len,
115- lane_offsets : BufferHandle :: new_host ( lane_offsets ) ,
186+ lane_offsets,
116187 indices,
117188 values,
118189 stats_set : ArrayStats :: default ( ) ,
@@ -127,16 +198,26 @@ impl PatchedArray {
127198 /// # Panics
128199 ///
129200 /// Note that this function will panic if the caller requests out of bounds chunk/lane ordinals.
130- pub ( crate ) fn lane_range ( & self , chunk : usize , lane : usize ) -> Range < usize > {
131- assert ! ( chunk < self . n_chunks ) ;
201+ pub ( crate ) fn lane_range ( & self , chunk : usize , lane : usize ) -> VortexResult < Range < usize > > {
202+ assert ! ( chunk * 1024 <= self . len + self . offset ) ;
132203 assert ! ( lane < self . n_lanes) ;
133204
134- let lane_offsets = self . lane_offsets . as_host ( ) . reinterpret :: < u32 > ( ) ;
205+ let start = self . lane_offsets . scalar_at ( chunk * self . n_lanes + lane) ?;
206+ let stop = self
207+ . lane_offsets
208+ . scalar_at ( chunk * self . n_lanes + lane + 1 ) ?;
209+
210+ let start = start
211+ . as_primitive ( )
212+ . as_ :: < usize > ( )
213+ . ok_or_else ( || vortex_err ! ( "could not cast lane_offset to usize" ) ) ?;
135214
136- let start = lane_offsets[ chunk * self . n_lanes + lane] as usize ;
137- let stop = lane_offsets[ chunk * self . n_lanes + lane + 1 ] as usize ;
215+ let stop = stop
216+ . as_primitive ( )
217+ . as_ :: < usize > ( )
218+ . ok_or_else ( || vortex_err ! ( "could not cast lane_offset to usize" ) ) ?;
138219
139- start..stop
220+ Ok ( start..stop)
140221 }
141222
142223 /// Slice the array to just the patches and inner values that are within the chunk range.
@@ -146,7 +227,7 @@ impl PatchedArray {
146227
147228 let sliced_lane_offsets = self
148229 . lane_offsets
149- . slice_typed :: < u32 > ( lane_offsets_start..lane_offsets_stop) ;
230+ . slice ( lane_offsets_start..lane_offsets_stop) ? ;
150231 let indices = self . indices . clone ( ) ;
151232 let values = self . values . clone ( ) ;
152233
@@ -158,11 +239,9 @@ impl PatchedArray {
158239 let inner = self . inner . slice ( begin..end) ?;
159240
160241 let len = end - begin;
161- let n_chunks = ( end - begin) . div_ceil ( 1024 ) ;
162242
163243 Ok ( PatchedArray {
164244 inner,
165- n_chunks,
166245 n_lanes : self . n_lanes ,
167246 offset,
168247 len,
@@ -281,7 +360,6 @@ fn transpose<I: IntegerPType, V: NativePType>(
281360 }
282361
283362 TransposedPatches {
284- n_chunks,
285363 n_lanes,
286364 lane_offsets : lane_offsets. freeze ( ) . into_byte_buffer ( ) ,
287365 indices : indices_buffer. freeze ( ) . into_byte_buffer ( ) ,
0 commit comments