Skip to content

Commit 821f42f

Browse files
authored
Add docs for BitReader (#9948)
# Which issue does this PR close?

- Related to #9372

# Rationale for this change

While reviewing the ALP implementation from @sdf-jkl, I ran into this struct, which I hadn't really used before.

- #9372

Now that I have read it, I wanted to capture that information as doc comments (for my future self and hopefully for others).

# What changes are included in this PR?

Add documentation comments to `BitReader`.

# Are these changes tested?

Just docs.

# Are there any user-facing changes?

Just docs on an internal struct.
1 parent 86d3401 commit 821f42f

1 file changed

Lines changed: 81 additions & 28 deletions

File tree

parquet/src/util/bit_util.rs

Lines changed: 81 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,15 @@ impl BitWriter {
521521
/// MAX_VLQ_BYTE_LEN = 5 for i32, and MAX_VLQ_BYTE_LEN = 10 for i64
522522
pub const MAX_VLQ_BYTE_LEN: usize = 10;
523523

524+
/// Reads bit packed values from an in-memory buffer.
525+
///
526+
/// `BitReader` is the dual of [`BitWriter`] and reads values that are either
527+
/// byte aligned or packed at arbitrary bit widths. It is primarily used by the
528+
/// Parquet RLE/bit-packing hybrid decoder.
529+
///
530+
/// Reads advance an internal cursor; once the buffer is exhausted, reads
530+
/// return `None` (or a short count for batch/byte reads), not a panic.
531+
/// To rewind, use [`BitReader::reset`] with the same (or a different) buffer.
524533
pub struct BitReader {
525534
/// The byte buffer to read from, passed in by client
526535
buffer: Bytes,
@@ -544,9 +553,9 @@ pub struct BitReader {
544553
bit_offset: usize,
545554
}
546555

547-
/// Utility class to read bit/byte stream. This class can read bits or bytes that are
548-
/// either byte aligned or not.
549556
impl BitReader {
557+
/// Creates a new [`BitReader`] that reads from `buffer`, starting at
558+
/// bit offset 0.
550559
pub fn new(buffer: Bytes) -> Self {
551560
BitReader {
552561
buffer,
@@ -556,22 +565,36 @@ impl BitReader {
556565
}
557566
}
558567

568+
/// Resets this reader to read from the start of `buffer`, discarding any
569+
/// previous buffer and position.
570+
///
571+
/// This is useful for reusing the same `BitReader` instance across
572+
/// multiple input buffers without allocation.
559573
pub fn reset(&mut self, buffer: Bytes) {
560574
self.buffer = buffer;
561575
self.buffered_values = 0;
562576
self.byte_offset = 0;
563577
self.bit_offset = 0;
564578
}
565579

566-
/// Gets the current byte offset
580+
/// Returns the current byte offset, rounded up to the next whole byte.
581+
///
582+
/// This is the index of the next byte that a byte-aligned
583+
/// read (such as [`BitReader::get_aligned`]) would consume.
567584
#[inline]
568585
pub fn get_byte_offset(&self) -> usize {
569586
self.byte_offset + ceil(self.bit_offset, 8)
570587
}
571588

572-
/// Reads a value of type `T` and of size `num_bits`.
589+
/// Reads a single bit-packed value of `num_bits` bits as a `T` from the
590+
/// stream.
591+
///
592+
/// The value is read as the low `num_bits` bits of `T`. Bits are consumed
593+
/// from the stream in little-endian bit order.
573594
///
574-
/// Returns `None` if there's not enough data available. `Some` otherwise.
595+
/// Returns `None` if there are fewer than `num_bits` bits left in the
596+
/// buffer; otherwise `Some(value)`. On `None` the reader's position is
597+
/// left unchanged.
575598
pub fn get_value<T: FromBitpacked>(&mut self, num_bits: usize) -> Option<T> {
576599
debug_assert!(num_bits <= 64);
577600
debug_assert!(num_bits <= size_of::<T>() * 8);
@@ -607,14 +630,21 @@ impl BitReader {
607630
Some(T::from_u64(v))
608631
}
609632

610-
/// Read multiple values from their packed representation where each element is represented
611-
/// by `num_bits` bits.
633+
/// Reads up to `batch.len()` bit-packed values, each `num_bits` bits wide,
634+
/// into `batch`.
635+
///
636+
/// Equivalent to repeatedly calling [`BitReader::get_value`] with the same
637+
/// `num_bits`, but faster because it dispatches to SIMD-friendly
638+
/// fixed-width unpacking routines whenever possible.
639+
///
640+
/// Returns the number of values actually written to `batch`. This will be
641+
/// less than `batch.len()` if the underlying buffer is exhausted before
642+
/// `batch` is filled.
612643
///
613644
/// # Panics
614645
///
615646
/// This function panics if
616647
/// - `num_bits` is larger than the bit-capacity of `T`
617-
///
618648
pub fn get_batch<T: FromBitpacked>(&mut self, batch: &mut [T], num_bits: usize) -> usize {
619649
debug_assert!(num_bits <= size_of::<T>() * 8);
620650

@@ -756,9 +786,12 @@ impl BitReader {
756786
values_to_read
757787
}
758788

759-
/// Skip num_value values with num_bits bit width
789+
/// Skips `num_values` bit-packed values of `num_bits` bits, advancing the
790+
/// reader past them without decoding.
760791
///
761-
/// Return the number of values skipped (up to num_values)
792+
/// Returns the number of values actually skipped (up to `num_values`).
793+
/// This will be less than `num_values` if the underlying buffer is
794+
/// exhausted.
762795
pub fn skip(&mut self, num_values: usize, num_bits: usize) -> usize {
763796
debug_assert!(num_bits <= 64);
764797

@@ -782,7 +815,11 @@ impl BitReader {
782815
values_to_read
783816
}
784817

785-
/// Reads up to `num_bytes` to `buf` returning the number of bytes read
818+
/// Reads up to `num_bytes` bytes from the stream, appending them to `buf`,
819+
/// and returns the number of bytes actually appended.
820+
///
821+
/// The reader is first advanced to the next byte boundary, so any
822+
/// in-progress bit-level read is discarded before the bytes are copied.
786823
pub(crate) fn get_aligned_bytes(&mut self, buf: &mut Vec<u8>, num_bytes: usize) -> usize {
787824
// Align to byte offset
788825
self.byte_offset = self.get_byte_offset();
@@ -797,13 +834,15 @@ impl BitReader {
797834
to_read
798835
}
799836

800-
/// Reads a `num_bytes`-sized value from this buffer and return it.
801-
/// `T` needs to be a little-endian native type. The value is assumed to be byte
802-
/// aligned so the bit reader will be advanced to the start of the next byte before
803-
/// reading the value.
837+
/// Reads a `num_bytes`-sized value of type `T` from the stream.
838+
///
839+
/// `T` is interpreted as a little-endian native type. The value is
840+
/// assumed to be byte aligned, so the reader is first advanced to the
841+
/// start of the next byte before reading.
804842
///
805-
/// Returns `Some` if there's enough bytes left to form a value of `T`.
806-
/// Otherwise `None`.
843+
/// Returns `Some(value)` if there are at least `num_bytes` bytes left in
844+
/// the buffer after byte-alignment, and `None` otherwise. On `None` the
845+
/// reader's byte position is still advanced to the alignment boundary.
807846
pub fn get_aligned<T: FromBytes>(&mut self, num_bytes: usize) -> Option<T> {
808847
self.byte_offset = self.get_byte_offset();
809848
self.bit_offset = 0;
@@ -819,10 +858,18 @@ impl BitReader {
819858
Some(v)
820859
}
821860

822-
/// Reads a VLQ encoded (in little endian order) int from the stream.
823-
/// The encoded int must start at the beginning of a byte.
861+
/// Reads a VLQ-encoded (in little-endian order) integer from the stream.
824862
///
825-
/// Returns `None` if there's not enough bytes in the stream. `Some` otherwise.
863+
/// The encoded integer must start at the beginning of a byte; the reader
864+
/// is first advanced to the next byte boundary before decoding.
865+
///
866+
/// Returns `Some(value)` on success, or `None` if the buffer is exhausted
867+
/// before a complete VLQ value is read.
868+
///
869+
/// # Panics
870+
///
871+
/// Panics if the encoded integer is longer than [`MAX_VLQ_BYTE_LEN`]
872+
/// bytes (bad input).
826873
pub fn get_vlq_int(&mut self) -> Option<i64> {
827874
// Align to byte boundary once, then read bytes directly
828875
self.byte_offset = self.get_byte_offset();
@@ -847,15 +894,21 @@ impl BitReader {
847894
None
848895
}
849896

850-
/// Reads a zigzag-VLQ encoded (in little endian order) int from the stream
851-
/// Zigzag-VLQ is a variant of VLQ encoding where negative and positive numbers are
852-
/// encoded in a zigzag fashion.
853-
/// See: https://developers.google.com/protocol-buffers/docs/encoding
897+
/// Reads a zigzag-VLQ-encoded little-endian integer from the
898+
/// stream.
899+
///
900+
/// Zigzag-VLQ is a variant of VLQ encoding where negative and positive
901+
/// numbers are interleaved so that small absolute values produce short
902+
/// encodings regardless of sign. See the [Protocol Buffers encoding
903+
/// documentation](https://developers.google.com/protocol-buffers/docs/encoding)
904+
/// for details.
854905
///
855-
/// Note: the encoded int must start at the beginning of a byte.
906+
/// As with [`BitReader::get_vlq_int`], the encoded integer must start at
907+
/// the beginning of a byte; the reader is first advanced to the next
908+
/// byte boundary before decoding.
856909
///
857-
/// Returns `None` if the number of bytes there's not enough bytes in the stream.
858-
/// `Some` otherwise.
910+
/// Returns `Some(value)` on success, or `None` if the buffer is exhausted
911+
/// before a complete value is read.
859912
#[inline]
860913
pub fn get_zigzag_vlq_int(&mut self) -> Option<i64> {
861914
self.get_vlq_int().map(|v| {
@@ -864,7 +917,7 @@ impl BitReader {
864917
})
865918
}
866919

867-
/// Loads up to the the next 8 bytes from `self.buffer` at `self.byte_offset`
920+
/// Loads up to the next 8 bytes from `self.buffer` at `self.byte_offset`
868921
/// into `self.buffered_values`.
869922
///
870923
/// Reads fewer than 8 bytes if there are fewer than 8 bytes left

0 commit comments

Comments
 (0)