Skip to content

Commit 33907bf

Browse files
committed
use child
Signed-off-by: Andrew Duffy <andrew@a10y.dev>
1 parent e874e3f commit 33907bf

8 files changed

Lines changed: 140 additions & 65 deletions

File tree

vortex-array/src/arrays/patched/array.rs

Lines changed: 96 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use vortex_buffer::Buffer;
77
use vortex_buffer::BufferMut;
88
use vortex_error::VortexResult;
99
use vortex_error::vortex_ensure;
10+
use vortex_error::vortex_err;
1011

1112
use crate::ArrayRef;
1213
use crate::Canonical;
@@ -27,26 +28,91 @@ use crate::stats::ArrayStats;
2728
use crate::validity::Validity;
2829

2930
/// An array that partially "patches" another array with new values.
31+
///
32+
/// # Background
33+
///
34+
/// This is meant to be the foundation of a fully data-parallel patching strategy, based on the
35+
/// work published in ["G-ALP" from Hepkema et al.](https://ir.cwi.nl/pub/35205/35205.pdf)
36+
///
37+
/// Patching is common when an encoding almost completely covers an array save a few exceptions.
38+
/// In that case, rather than avoid the encoding entirely, it's preferable to
39+
///
40+
/// * Replace unencodable values with fillers (zeros, frequent values, nulls, etc.)
41+
/// * Wrap the array with a `PatchedArray` signaling that when the original array is executed,
42+
/// some of the decoded values must be overwritten.
43+
///
44+
/// In Vortex, the FastLanes bit-packing encoding is often the terminal node in an encoding tree,
45+
/// and FastLanes has an intrinsic chunking of 1024 elements. Thus, 1024 elements is pervasively
46+
/// a useful unit of chunking throughout Vortex, and so we use 1024 as a chunk size here
47+
/// as well.
48+
///
49+
/// # Details
50+
///
51+
/// To patch an array, we first divide it into a set of chunks of length 1024, and then within
52+
/// each chunk, we assign each position to a lane. The number of lanes depends on the width of
53+
/// the underlying type.
54+
///
55+
/// Thus, rather than sorting patch indices and values by their global offset, they are sorted
56+
/// primarily by their chunk, and then subsequently by their lanes.
57+
///
58+
/// The Patched array layout has 4 children
59+
///
60+
/// * `inner`: the inner array is the one containing encoded values, including the filler values
61+
/// that need to be patched over at execution time
62+
/// * `lane_offsets`: this is an indexing buffer that allows you to see into ranges of the other
63+
/// two children
64+
/// * `indices`: An array of `u16` chunk indices, indicating where within the chunk should the value
65+
/// be overwritten by the patch value
66+
/// * `values`: The child array containing the patch values, which should be inserted over
67+
/// the values of the `inner` at the locations provided by `indices`
68+
///
69+
/// `indices` and `values` are aligned and accessed together.
70+
///
71+
/// ```text
72+
///
73+
/// chunk 0 chunk 0 chunk 0 chunk 0 chunk 0 chunk 0
74+
/// lane 0 lane 1 lane 2 lane 3 lane 4 lane 5
75+
/// ┌────────────┬────────────┬────────────┬────────────┬────────────┬────────────┐
76+
/// lane_offsets │ 0 │ 0 │ 2 │ 2 │ 3 │ 5 │ ...
77+
/// └─────┬──────┴─────┬──────┴─────┬──────┴──────┬─────┴──────┬─────┴──────┬─────┘
78+
/// │ │ │ │ │ │
79+
/// │ │ │ │ │ │
80+
/// ┌─────┴────────────┘ └──────┬──────┘ ┌──────┘ └─────┐
81+
/// │ │ │ │
82+
/// │ │ │ │
83+
/// │ │ │ │
84+
/// ▼────────────┬────────────┬────────────▼────────────▼────────────┬────────────▼
85+
/// indices │ │ │ │ │ │ │
86+
/// │ │ │ │ │ │ │
87+
/// ├────────────┼────────────┼────────────┼────────────┼────────────┼────────────┤
88+
/// values │ │ │ │ │ │ │
89+
/// │ │ │ │ │ │ │
90+
/// └────────────┴────────────┴────────────┴────────────┴────────────┴────────────┘
91+
/// ```
92+
///
93+
/// It turns out that this layout is optimal for executing patching on GPUs, because the
94+
/// `lane_offsets` allows each thread in a warp to seek to its patches in constant time.
3095
#[derive(Debug, Clone)]
3196
pub struct PatchedArray {
3297
/// The inner array that is being patched. This is the zeroth child.
3398
pub(super) inner: ArrayRef,
3499

35-
/// Number of 1024-element chunks. Pre-computed for convenience.
36-
pub(super) n_chunks: usize,
37-
38100
/// Number of lanes the patch indices and values have been split into. Each of the `n_chunks`
39101
/// of 1024 values is split into `n_lanes` lanes horizontally, each lane having 1024 / n_lanes
40102
/// values that might be patched.
41103
pub(super) n_lanes: usize,
42104

43-
/// Offset into the first chunk
105+
/// The offset into that first chunk that is considered in bounds.
106+
///
107+
/// The patch indices of the first chunk less than `offset` should be skipped, and the offset
108+
/// should be subtracted out of the remaining offsets to get their final position in the
109+
/// executed array.
44110
pub(super) offset: usize,
45111
/// Total length.
46112
pub(super) len: usize,
47113

48114
/// lane offsets. The PType of these MUST be u32
49-
pub(super) lane_offsets: BufferHandle,
115+
pub(super) lane_offsets: ArrayRef,
50116
/// indices within a 1024-element chunk. The PType of these MUST be u16
51117
pub(super) indices: ArrayRef,
52118
/// patch values corresponding to the indices. The ptype is specified by `values_ptype`.
@@ -84,13 +150,19 @@ impl PatchedArray {
84150
let values_ptype = patches.dtype().as_ptype();
85151

86152
let TransposedPatches {
87-
n_chunks,
88153
n_lanes,
89154
lane_offsets,
90155
indices,
91156
values,
157+
..
92158
} = transpose_patches(patches, ctx)?;
93159

160+
let lane_offsets = PrimitiveArray::from_buffer_handle(
161+
BufferHandle::new_host(lane_offsets),
162+
PType::U32,
163+
Validity::NonNullable,
164+
)
165+
.into_array();
94166
let indices = PrimitiveArray::from_buffer_handle(
95167
BufferHandle::new_host(indices),
96168
PType::U16,
@@ -108,11 +180,10 @@ impl PatchedArray {
108180

109181
Ok(Self {
110182
inner,
111-
n_chunks,
112183
n_lanes,
113184
offset: 0,
114185
len,
115-
lane_offsets: BufferHandle::new_host(lane_offsets),
186+
lane_offsets,
116187
indices,
117188
values,
118189
stats_set: ArrayStats::default(),
@@ -127,16 +198,26 @@ impl PatchedArray {
127198
/// # Panics
128199
///
129200
/// Note that this function will panic if the caller requests out of bounds chunk/lane ordinals.
130-
pub(crate) fn lane_range(&self, chunk: usize, lane: usize) -> Range<usize> {
131-
assert!(chunk < self.n_chunks);
201+
pub(crate) fn lane_range(&self, chunk: usize, lane: usize) -> VortexResult<Range<usize>> {
202+
assert!(chunk * 1024 <= self.len + self.offset);
132203
assert!(lane < self.n_lanes);
133204

134-
let lane_offsets = self.lane_offsets.as_host().reinterpret::<u32>();
205+
let start = self.lane_offsets.scalar_at(chunk * self.n_lanes + lane)?;
206+
let stop = self
207+
.lane_offsets
208+
.scalar_at(chunk * self.n_lanes + lane + 1)?;
209+
210+
let start = start
211+
.as_primitive()
212+
.as_::<usize>()
213+
.ok_or_else(|| vortex_err!("could not cast lane_offset to usize"))?;
135214

136-
let start = lane_offsets[chunk * self.n_lanes + lane] as usize;
137-
let stop = lane_offsets[chunk * self.n_lanes + lane + 1] as usize;
215+
let stop = stop
216+
.as_primitive()
217+
.as_::<usize>()
218+
.ok_or_else(|| vortex_err!("could not cast lane_offset to usize"))?;
138219

139-
start..stop
220+
Ok(start..stop)
140221
}
141222

142223
/// Slice the array to just the patches and inner values that are within the chunk range.
@@ -146,7 +227,7 @@ impl PatchedArray {
146227

147228
let sliced_lane_offsets = self
148229
.lane_offsets
149-
.slice_typed::<u32>(lane_offsets_start..lane_offsets_stop);
230+
.slice(lane_offsets_start..lane_offsets_stop)?;
150231
let indices = self.indices.clone();
151232
let values = self.values.clone();
152233

@@ -158,11 +239,9 @@ impl PatchedArray {
158239
let inner = self.inner.slice(begin..end)?;
159240

160241
let len = end - begin;
161-
let n_chunks = (end - begin).div_ceil(1024);
162242

163243
Ok(PatchedArray {
164244
inner,
165-
n_chunks,
166245
n_lanes: self.n_lanes,
167246
offset,
168247
len,
@@ -281,7 +360,6 @@ fn transpose<I: IntegerPType, V: NativePType>(
281360
}
282361

283362
TransposedPatches {
284-
n_chunks,
285363
n_lanes,
286364
lane_offsets: lane_offsets.freeze().into_byte_buffer(),
287365
indices: indices_buffer.freeze().into_byte_buffer(),

vortex-array/src/arrays/patched/compute/compare.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ impl CompareKernel for Patched {
5858

5959
let mut bits = BitBufferMut::from_buffer(bits.unwrap_host().into_mut(), offset, len);
6060

61-
let lane_offsets = lhs.lane_offsets.as_host().reinterpret::<u32>();
61+
let lane_offsets = lhs.lane_offsets.clone().execute::<PrimitiveArray>(ctx)?;
6262
let indices = lhs.indices.clone().execute::<PrimitiveArray>(ctx)?;
6363
let values = lhs.values.clone().execute::<PrimitiveArray>(ctx)?;
6464
let n_lanes = lhs.n_lanes;
@@ -76,7 +76,7 @@ impl CompareKernel for Patched {
7676
bits: &mut bits,
7777
offset,
7878
n_lanes,
79-
lane_offsets,
79+
lane_offsets: lane_offsets.as_slice::<u32>(),
8080
indices,
8181
values,
8282
constant,

vortex-array/src/arrays/patched/compute/filter.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,10 @@ impl FilterReduce for Patched {
3232
}
3333
};
3434

35+
let n_chunks = (array.offset + array.len).div_ceil(1024);
36+
3537
// If all chunks already covered, there is nothing to do.
36-
if chunk_start == 0 && chunk_stop == array.n_chunks {
38+
if chunk_start == 0 && chunk_stop == n_chunks {
3739
return Ok(None);
3840
}
3941

vortex-array/src/arrays/patched/compute/take.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ impl TakeExecute for Patched {
4646
match_each_unsigned_integer_ptype!(indices_ptype, |I| {
4747
match_each_native_ptype!(ptype, |V| {
4848
let indices = indices.clone().execute::<PrimitiveArray>(ctx)?;
49+
let lane_offsets = array.lane_offsets.clone().execute::<PrimitiveArray>(ctx)?;
4950
let patch_indices = array.indices.clone().execute::<PrimitiveArray>(ctx)?;
5051
let patch_values = array.values.clone().execute::<PrimitiveArray>(ctx)?;
5152
let mut output = Buffer::<V>::from_byte_buffer(buffer.unwrap_host()).into_mut();
@@ -54,9 +55,8 @@ impl TakeExecute for Patched {
5455
indices.as_slice::<I>(),
5556
array.offset,
5657
array.len,
57-
array.n_chunks,
5858
array.n_lanes,
59-
array.lane_offsets.as_host().reinterpret::<u32>(),
59+
lane_offsets.as_slice::<u32>(),
6060
patch_indices.as_slice::<u16>(),
6161
patch_values.as_slice::<V>(),
6262
);
@@ -82,12 +82,12 @@ fn take_map<I: IntegerPType, V: NativePType>(
8282
indices: &[I],
8383
offset: usize,
8484
len: usize,
85-
n_chunks: usize,
8685
n_lanes: usize,
8786
lane_offsets: &[u32],
8887
patch_index: &[u16],
8988
patch_value: &[V],
9089
) {
90+
let n_chunks = (offset + len).div_ceil(1024);
9191
// Build a hashmap of patch_index -> values.
9292
let mut index_map = FxHashMap::with_capacity_and_hasher(indices.len(), Default::default());
9393
for chunk in 0..n_chunks {

vortex-array/src/arrays/patched/mod.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ pub use vtable::*;
1111

1212
/// Patches that have been transposed into GPU format.
1313
struct TransposedPatches {
14-
n_chunks: usize,
1514
n_lanes: usize,
1615
lane_offsets: ByteBuffer,
1716
indices: ByteBuffer,

0 commit comments

Comments
 (0)