Skip to content

Commit f9ab66b

Browse files
authored
Expanding conceptual docs (and some other minor docs) (#8552)
## Summary Adds more docs, both module-level conceptual docs which pull from `docs/`, and many local changes documenting specific functions or types. Also tried to include basic examples. The main changes are in `vortex`, `vortex-array` and `vortex-file` and `vortex-layout`. Was done with the help of Claude to try and find gaps/mistakes, and fill in some of the obvious changes. --------- Signed-off-by: Adam Gutglick <adam@spiraldb.com>
1 parent ba700ca commit f9ab66b

55 files changed

Lines changed: 972 additions & 128 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

encodings/fastlanes/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ repository = { workspace = true }
1313
rust-version = { workspace = true }
1414
version = { workspace = true }
1515

16+
[package.metadata.docs.rs]
17+
all-features = true
18+
1619
[lints]
1720
workspace = true
1821

encodings/fastlanes/src/lib.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,29 @@
33

44
#![expect(clippy::cast_possible_truncation)]
55

6+
//! FastLanes integer encodings for Vortex arrays.
7+
//!
8+
//! This crate provides SIMD-friendly integer encodings:
9+
//!
10+
//! - [`BitPacked`] stores fixed-width integer values using the minimum bit width plus optional
11+
//! patches.
12+
//! - [`FoR`] stores frame-of-reference deltas from a base value.
13+
//! - [`Delta`] stores adjacent deltas in chunked form.
14+
//! - [`RLE`] stores repeated runs.
15+
//!
16+
//! Call [`initialize`] to register the encodings and encoding-specific aggregate kernels in a
17+
//! session before deserializing or executing arrays that may contain these encodings.
18+
//!
19+
//! ```rust
20+
//! let session = vortex_array::array_session();
21+
//! vortex_fastlanes::initialize(&session);
22+
//! ```
23+
//!
24+
//! ## Paper
25+
//!
26+
//! The original encodings are described in the paper [The FastLanes Compression Layout](https://15721.courses.cs.cmu.edu/spring2024/papers/03-data2/p2132-afroozeh.pdf),
27+
//! but the encodings in this crate are not fully binary compatible with it. See the underlying [fastlanes](https://github.com/spiraldb/fastlanes) crate for more details.
28+
629
pub use bitpacking::*;
730
pub use delta::*;
831
pub use r#for::*;

encodings/pco/src/array.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ pub(crate) fn vortex_err_from_pco(err: PcoError) -> VortexError {
282282
}
283283

284284
#[derive(Clone, Debug)]
285+
/// Pco array encoding marker.
285286
pub struct Pco;
286287

287288
impl Pco {
@@ -317,6 +318,7 @@ pub(super) const NUM_SLOTS: usize = 1;
317318
pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["validity"];
318319

319320
#[derive(Clone, Debug)]
321+
/// Encoding-specific data for a [`PcoArray`].
320322
pub struct PcoData {
321323
pub(crate) chunk_metas: Vec<ByteBuffer>,
322324
pub(crate) pages: Vec<ByteBuffer>,
@@ -338,6 +340,7 @@ impl Display for PcoData {
338340
}
339341

340342
impl PcoData {
343+
/// Validate dtype, validity, slice, and Pco component invariants.
341344
pub fn validate(&self, dtype: &DType, len: usize, validity: &Validity) -> VortexResult<()> {
342345
let _ = number_type_from_ptype(self.ptype);
343346
vortex_ensure!(
@@ -391,6 +394,7 @@ impl PcoData {
391394
Ok(())
392395
}
393396

397+
/// Construct unsliced Pco data from chunk metadata, pages, and serialized metadata.
394398
pub fn new(
395399
chunk_metas: Vec<ByteBuffer>,
396400
pages: Vec<ByteBuffer>,
@@ -409,6 +413,7 @@ impl PcoData {
409413
}
410414
}
411415

416+
/// Compress a primitive array into Pco data.
412417
pub fn from_primitive(
413418
parray: ArrayView<'_, Primitive>,
414419
level: usize,
@@ -497,6 +502,11 @@ impl PcoData {
497502
))
498503
}
499504

505+
/// Downcast and compress an array into Pco data.
506+
///
507+
/// # Errors
508+
///
509+
/// Returns an error if the input is not a primitive array or compression fails.
500510
pub fn from_array(
501511
array: ArrayRef,
502512
level: usize,
@@ -512,6 +522,7 @@ impl PcoData {
512522
Self::from_primitive(parray.as_view(), level, nums_per_page, ctx)
513523
}
514524

525+
/// Decompress this Pco data into a primitive array.
515526
pub fn decompress(
516527
&self,
517528
unsliced_validity: &Validity,

encodings/pco/src/lib.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,23 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4+
//! Pco-backed numeric compression encoding for Vortex arrays.
5+
//!
6+
//! [`PcoArray`] stores valid primitive numeric values in Pco chunks and pages, while Vortex
7+
//! validity tracks null rows separately. Page metadata lets slices decompress only the components
8+
//! required for the requested row range.
9+
//!
10+
//! Pco supports integer and floating-point primitive dtypes handled by the upstream `pco` crate.
11+
//! It is normally selected through the BtrBlocks compressor when the `pco` feature is enabled.
12+
//! To deserialize arrays manually, register the encoding in the array session:
13+
//!
14+
//! ```rust
15+
//! use vortex_array::session::ArraySessionExt;
16+
//!
17+
//! let session = vortex_array::array_session();
18+
//! session.arrays().register(vortex_pco::Pco);
19+
//! ```
20+
421
mod array;
522
mod compute;
623
mod rules;
@@ -9,27 +26,34 @@ mod slice;
926
pub use array::*;
1027

1128
#[derive(Clone, prost::Message)]
29+
/// Metadata for one Pco page.
1230
pub struct PcoPageInfo {
1331
// Since pco limits to 2^24 values per chunk, u32 is sufficient for the
1432
// count of values.
33+
/// Number of valid primitive values stored in this page.
1534
#[prost(uint32, tag = "1")]
1635
pub n_values: u32,
1736
}
1837

1938
// We're calling this Info instead of Metadata because ChunkMeta refers to a specific
2039
// component of a Pco file.
2140
#[derive(Clone, prost::Message)]
41+
/// Metadata for one Pco chunk.
2242
pub struct PcoChunkInfo {
43+
/// Pages contained in this chunk.
2344
#[prost(message, repeated, tag = "1")]
2445
pub pages: Vec<PcoPageInfo>,
2546
}
2647

2748
#[derive(Clone, prost::Message)]
49+
/// Serialized metadata for a [`PcoArray`].
2850
pub struct PcoMetadata {
2951
// would be nice to reuse one header per vortex file, but it's really only 1 byte, so
3052
// no issue duplicating it here per PcoArray
53+
/// Pco file header bytes.
3154
#[prost(bytes, tag = "1")]
3255
pub header: Vec<u8>,
56+
/// Metadata for each compressed chunk.
3357
#[prost(message, repeated, tag = "2")]
3458
pub chunks: Vec<PcoChunkInfo>,
3559
}

encodings/zstd/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ repository = { workspace = true }
1313
rust-version = { workspace = true }
1414
version = { workspace = true }
1515

16+
[package.metadata.docs.rs]
17+
all-features = true
18+
1619
[lints]
1720
workspace = true
1821

encodings/zstd/src/array.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,9 +252,11 @@ impl VTable for Zstd {
252252
}
253253

254254
#[derive(Clone, Debug)]
255+
/// Zstd array encoding marker.
255256
pub struct Zstd;
256257

257258
impl Zstd {
259+
/// Construct a [`ZstdArray`] from validated compressed data and validity.
258260
pub fn try_new(dtype: DType, data: ZstdData, validity: Validity) -> VortexResult<ZstdArray> {
259261
let len = data.len();
260262
data.validate(&dtype, len, &validity)?;
@@ -309,6 +311,7 @@ impl Zstd {
309311
)
310312
}
311313

314+
/// Decompress a [`ZstdArray`] into its canonical Vortex representation.
312315
pub fn decompress(array: &ZstdArray, ctx: &mut ExecutionCtx) -> VortexResult<ArrayRef> {
313316
let unsliced_validity = child_to_validity(
314317
array.as_ref().slots()[0].as_ref(),
@@ -325,6 +328,7 @@ pub(super) const NUM_SLOTS: usize = 1;
325328
pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["validity"];
326329

327330
#[derive(Clone, Debug)]
331+
/// Encoding-specific data for a [`ZstdArray`].
328332
pub struct ZstdData {
329333
pub(crate) dictionary: Option<ByteBuffer>,
330334
pub(crate) frames: Vec<ByteBuffer>,
@@ -344,13 +348,21 @@ impl Display for ZstdData {
344348
}
345349
}
346350

351+
/// Movable parts of a [`ZstdData`] value plus its validity.
347352
pub struct ZstdDataParts {
353+
/// Optional zstd dictionary shared by all frames.
348354
pub dictionary: Option<ByteBuffer>,
355+
/// Compressed zstd frames.
349356
pub frames: Vec<ByteBuffer>,
357+
/// Serialized frame and dictionary metadata.
350358
pub metadata: ZstdMetadata,
359+
/// Unsliced validity for the array.
351360
pub validity: Validity,
361+
/// Unsliced row count.
352362
pub n_rows: usize,
363+
/// Start of this logical slice in unsliced row coordinates.
353364
pub slice_start: usize,
365+
/// End of this logical slice in unsliced row coordinates.
354366
pub slice_stop: usize,
355367
}
356368

@@ -466,6 +478,7 @@ pub fn reconstruct_views(
466478
}
467479

468480
impl ZstdData {
481+
/// Construct unsliced zstd data from raw frames and metadata.
469482
pub fn new(
470483
dictionary: Option<ByteBuffer>,
471484
frames: Vec<ByteBuffer>,
@@ -482,6 +495,7 @@ impl ZstdData {
482495
}
483496
}
484497

498+
/// Validate dtype, slice, validity, frame, and dictionary invariants.
485499
pub fn validate(&self, dtype: &DType, len: usize, validity: &Validity) -> VortexResult<()> {
486500
vortex_ensure!(
487501
matches!(
@@ -796,6 +810,9 @@ impl ZstdData {
796810
Ok(ZstdData::new(dictionary, frames, metadata, vbv.len()))
797811
}
798812

813+
/// Compress a supported canonical array into zstd data.
814+
///
815+
/// Returns `Ok(None)` for canonical variants that this encoding does not support.
799816
pub fn from_canonical(
800817
canonical: &Canonical,
801818
level: i32,
@@ -819,6 +836,11 @@ impl ZstdData {
819836
}
820837
}
821838

839+
/// Canonicalize and compress an array into zstd data.
840+
///
841+
/// # Errors
842+
///
843+
/// Returns an error if the array's canonical form is unsupported or compression fails.
822844
pub fn from_array(
823845
array: ArrayRef,
824846
level: i32,
@@ -1019,6 +1041,7 @@ impl ZstdData {
10191041
self.slice_stop == self.slice_start
10201042
}
10211043

1044+
/// Split this data into movable parts, attaching the supplied validity.
10221045
pub fn into_parts(self, validity: Validity) -> ZstdDataParts {
10231046
ZstdDataParts {
10241047
dictionary: self.dictionary,

encodings/zstd/src/lib.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,26 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4+
//! Zstd-backed compression encodings for variable-width Vortex arrays.
5+
//!
6+
//! [`ZstdArray`] stores UTF-8 or binary values as one or more zstd frames, optionally sharing a
7+
//! trained dictionary across frames. Frame metadata lets slices decompress only the frames that can
8+
//! contribute values to the requested row range.
9+
//!
10+
//! With the `unstable_encodings` feature, `ZstdBuffers` stores the buffers of another encoding as
11+
//! independently compressed zstd buffers while preserving the inner encoding metadata.
12+
//!
13+
//! This crate exposes array encodings only. Compression scheme selection is wired through
14+
//! `vortex-btrblocks` and file writing. To deserialize arrays manually, register the encoding in the
15+
//! array session:
16+
//!
17+
//! ```rust
18+
//! use vortex_array::session::ArraySessionExt;
19+
//!
20+
//! let session = vortex_array::array_session();
21+
//! session.arrays().register(vortex_zstd::Zstd);
22+
//! ```
23+
424
pub use array::*;
525
#[cfg(feature = "unstable_encodings")]
626
pub use zstd_buffers::*;
@@ -16,28 +36,38 @@ mod zstd_buffers;
1636
mod test;
1737

1838
#[derive(Clone, prost::Message)]
39+
/// Metadata for one zstd frame.
1940
pub struct ZstdFrameMetadata {
41+
/// Uncompressed byte size of this frame.
2042
#[prost(uint64, tag = "1")]
2143
pub uncompressed_size: u64,
44+
/// Number of valid values stored in this frame.
2245
#[prost(uint64, tag = "2")]
2346
pub n_values: u64,
2447
}
2548

2649
#[derive(Clone, prost::Message)]
50+
/// Serialized metadata for a [`ZstdArray`].
2751
pub struct ZstdMetadata {
2852
// optional, will be 0 if there's no dictionary
53+
/// Dictionary size in bytes, or `0` when no dictionary is present.
2954
#[prost(uint32, tag = "1")]
3055
pub dictionary_size: u32,
56+
/// Metadata for each compressed frame.
3157
#[prost(message, repeated, tag = "2")]
3258
pub frames: Vec<ZstdFrameMetadata>,
3359
}
3460

3561
#[derive(Clone, prost::Message)]
62+
/// Serialized metadata for the unstable `ZstdBuffers` encoding.
3663
pub struct ZstdBuffersMetadata {
64+
/// Encoding id of the inner array whose buffers were compressed.
3765
#[prost(string, tag = "1")]
3866
pub inner_encoding_id: String,
67+
/// Serialized metadata of the inner array.
3968
#[prost(bytes = "vec", tag = "2")]
4069
pub inner_metadata: Vec<u8>,
70+
/// Uncompressed byte size of each compressed buffer.
4171
#[prost(uint64, repeated, tag = "3")]
4272
pub uncompressed_sizes: Vec<u64>,
4373
/// Alignment of each buffer in bytes (must be a power of two).

0 commit comments

Comments
 (0)