@@ -521,6 +521,15 @@ impl BitWriter {
521521/// MAX_VLQ_BYTE_LEN = 5 for i32, and MAX_VLQ_BYTE_LEN = 10 for i64
522522pub const MAX_VLQ_BYTE_LEN : usize = 10 ;
523523
524+ /// Reads bit packed values from an in-memory buffer.
525+ ///
526+ /// `BitReader` is the dual of [`BitWriter`] and reads values that are either
527+ /// byte aligned or packed at arbitrary bit widths. It is primarily used by the
528+ /// Parquet RLE/bit-packing hybrid decoder.
529+ ///
530+ /// Reads advance an internal cursor; once the buffer is exhausted, the
531+ /// `get_*` methods return `None` rather than panicking. To rewind, use
532+ /// [`BitReader::reset`] with the same (or a different) buffer.
524533pub struct BitReader {
525534 /// The byte buffer to read from, passed in by client
526535 buffer : Bytes ,
@@ -544,9 +553,9 @@ pub struct BitReader {
544553 bit_offset : usize ,
545554}
546555
547- /// Utility class to read bit/byte stream. This class can read bits or bytes that are
548- /// either byte aligned or not.
549556impl BitReader {
557+ /// Creates a new [`BitReader`] that reads from `buffer`, starting at
558+ /// bit offset 0.
550559 pub fn new ( buffer : Bytes ) -> Self {
551560 BitReader {
552561 buffer,
@@ -556,22 +565,36 @@ impl BitReader {
556565 }
557566 }
558567
568+ /// Resets this reader to read from the start of `buffer`, discarding any
569+ /// previous buffer and position.
570+ ///
571+ /// This is useful for reusing the same `BitReader` instance across
572+ /// multiple input buffers without allocation.
559573 pub fn reset ( & mut self , buffer : Bytes ) {
560574 self . buffer = buffer;
561575 self . buffered_values = 0 ;
562576 self . byte_offset = 0 ;
563577 self . bit_offset = 0 ;
564578 }
565579
566- /// Gets the current byte offset
580+ /// Returns the current byte offset, rounded up to the next whole byte.
581+ ///
582+ /// This is the index of the next byte that a byte-aligned
583+ /// read (such as [`BitReader::get_aligned`]) would consume.
567584 #[ inline]
568585 pub fn get_byte_offset ( & self ) -> usize {
569586 self . byte_offset + ceil ( self . bit_offset , 8 )
570587 }
571588
572- /// Reads a value of type `T` and of size `num_bits`.
589+ /// Reads a single bit-packed value of `num_bits` bits as a `T` from the
590+ /// stream.
591+ ///
592+ /// The value is read as the low `num_bits` bits of `T`. Bits are consumed
593+ /// from the stream in little-endian bit order.
573594 ///
574- /// Returns `None` if there's not enough data available. `Some` otherwise.
595+ /// Returns `None` if there are fewer than `num_bits` bits left in the
596+ /// buffer; otherwise `Some(value)`. On `None` the reader's position is
597+ /// left unchanged.
575598 pub fn get_value < T : FromBitpacked > ( & mut self , num_bits : usize ) -> Option < T > {
576599 debug_assert ! ( num_bits <= 64 ) ;
577600 debug_assert ! ( num_bits <= size_of:: <T >( ) * 8 ) ;
@@ -607,14 +630,21 @@ impl BitReader {
607630 Some ( T :: from_u64 ( v) )
608631 }
609632
610- /// Read multiple values from their packed representation where each element is represented
611- /// by `num_bits` bits.
633+ /// Reads up to `batch.len()` bit-packed values of `num_bits` each, into
634+ /// `batch`.
635+ ///
636+ /// Equivalent to repeatedly calling [`BitReader::get_value`] with the same
637+ /// `num_bits`, but faster because it dispatches to SIMD-friendly
638+ /// fixed-width unpacking routines whenever possible.
639+ ///
640+ /// Returns the number of values actually written to `batch`. This will be
641+ /// less than `batch.len()` if the underlying buffer is exhausted before
642+ /// `batch` is filled.
612643 ///
613644 /// # Panics
614645 ///
615646 /// This function panics if
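616647 /// - `num_bits` is larger than the bit-capacity of `T` (the up-front check is a `debug_assert!`, so it is only guaranteed to fire when debug assertions are enabled)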
617- ///
618648 pub fn get_batch < T : FromBitpacked > ( & mut self , batch : & mut [ T ] , num_bits : usize ) -> usize {
619649 debug_assert ! ( num_bits <= size_of:: <T >( ) * 8 ) ;
620650
@@ -756,9 +786,12 @@ impl BitReader {
756786 values_to_read
757787 }
758788
759- /// Skip num_value values with num_bits bit width
789+ /// Skips `num_values` bit-packed values of `num_bits` bits, advancing the
790+ /// reader past them without decoding.
760791 ///
761- /// Return the number of values skipped (up to num_values)
792+ /// Returns the number of values actually skipped (up to `num_values`).
793+ /// This will be less than `num_values` if the underlying buffer is
794+ /// exhausted.
762795 pub fn skip ( & mut self , num_values : usize , num_bits : usize ) -> usize {
763796 debug_assert ! ( num_bits <= 64 ) ;
764797
@@ -782,7 +815,11 @@ impl BitReader {
782815 values_to_read
783816 }
784817
785- /// Reads up to `num_bytes` to `buf` returning the number of bytes read
818+ /// Reads up to `num_bytes` bytes from the stream, appending them to `buf`,
819+ /// and returns the number of bytes actually appended.
820+ ///
821+ /// The reader is first advanced to the next byte boundary, so any
822+ /// in-progress bit-level read is discarded before the bytes are copied.
786823 pub ( crate ) fn get_aligned_bytes ( & mut self , buf : & mut Vec < u8 > , num_bytes : usize ) -> usize {
787824 // Align to byte offset
788825 self . byte_offset = self . get_byte_offset ( ) ;
@@ -797,13 +834,15 @@ impl BitReader {
797834 to_read
798835 }
799836
800- /// Reads a `num_bytes`-sized value from this buffer and return it.
801- /// `T` needs to be a little-endian native type. The value is assumed to be byte
802- /// aligned so the bit reader will be advanced to the start of the next byte before
803- /// reading the value.
837+ /// Reads a `num_bytes`-sized value of type `T` from the stream.
838+ ///
839+ /// `T` is interpreted as a little-endian native type. The value is
840+ /// assumed to be byte aligned, so the reader is first advanced to the
841+ /// start of the next byte before reading.
804842 ///
805- /// Returns `Some` if there's enough bytes left to form a value of `T`.
806- /// Otherwise `None`.
843+ /// Returns `Some(value)` if there are at least `num_bytes` bytes left in
844+ /// the buffer after byte-alignment, and `None` otherwise. On `None` the
845+ /// reader's byte position is still advanced to the alignment boundary.
807846 pub fn get_aligned < T : FromBytes > ( & mut self , num_bytes : usize ) -> Option < T > {
808847 self . byte_offset = self . get_byte_offset ( ) ;
809848 self . bit_offset = 0 ;
@@ -819,10 +858,18 @@ impl BitReader {
819858 Some ( v)
820859 }
821860
822- /// Reads a VLQ encoded (in little endian order) int from the stream.
823- /// The encoded int must start at the beginning of a byte.
861+ /// Reads a VLQ-encoded (in little-endian order) integer from the stream.
824862 ///
825- /// Returns `None` if there's not enough bytes in the stream. `Some` otherwise.
863+ /// The encoded integer must start at the beginning of a byte; the reader
864+ /// is first advanced to the next byte boundary before decoding.
865+ ///
866+ /// Returns `Some(value)` on success, or `None` if the buffer is exhausted
867+ /// before a complete VLQ value is read.
868+ ///
869+ /// # Panics
870+ ///
871+ /// Panics if the encoded integer is longer than [`MAX_VLQ_BYTE_LEN`]
872+ /// bytes (bad input).
826873 pub fn get_vlq_int ( & mut self ) -> Option < i64 > {
827874 // Align to byte boundary once, then read bytes directly
828875 self . byte_offset = self . get_byte_offset ( ) ;
@@ -847,15 +894,21 @@ impl BitReader {
847894 None
848895 }
849896
850- /// Reads a zigzag-VLQ encoded (in little endian order) int from the stream
851- /// Zigzag-VLQ is a variant of VLQ encoding where negative and positive numbers are
852- /// encoded in a zigzag fashion.
853- /// See: https://developers.google.com/protocol-buffers/docs/encoding
897+ /// Reads a zigzag-VLQ-encoded little-endian integer from the
898+ /// stream.
899+ ///
900+ /// Zigzag-VLQ is a variant of VLQ encoding where negative and positive
901+ /// numbers are interleaved so that small absolute values produce short
902+ /// encodings regardless of sign. See the [Protocol Buffers encoding
903+ /// documentation](https://developers.google.com/protocol-buffers/docs/encoding)
904+ /// for details.
854905 ///
855- /// Note: the encoded int must start at the beginning of a byte.
906+ /// As with [`BitReader::get_vlq_int`], the encoded integer must start at
907+ /// the beginning of a byte; the reader is first advanced to the next
908+ /// byte boundary before decoding.
856909 ///
857- /// Returns `None` if the number of bytes there's not enough bytes in the stream.
858- /// `Some` otherwise .
910+ /// Returns `Some(value)` on success, or `None` if the buffer is exhausted
911+ /// before a complete value is read.
859912 #[ inline]
860913 pub fn get_zigzag_vlq_int ( & mut self ) -> Option < i64 > {
861914 self . get_vlq_int ( ) . map ( |v| {
@@ -864,7 +917,7 @@ impl BitReader {
864917 } )
865918 }
866919
867- /// Loads up to the the next 8 bytes from `self.buffer` at `self.byte_offset`
920+ /// Loads up to the next 8 bytes from `self.buffer` at `self.byte_offset`
868921 /// into `self.buffered_values`.
869922 ///
870923 /// Reads fewer than 8 bytes if there are fewer than 8 bytes left
0 commit comments