Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 41 additions & 19 deletions parquet/src/parquet_thrift.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ pub(crate) enum FieldType {
Set = 10,
Map = 11,
Struct = 12,
Uuid = 13,
}

impl TryFrom<u8> for FieldType {
Expand All @@ -175,25 +176,27 @@ impl TryFrom<u8> for FieldType {
10 => Ok(Self::Set),
11 => Ok(Self::Map),
12 => Ok(Self::Struct),
13 => Ok(Self::Uuid),
_ => Err(ThriftProtocolError::InvalidFieldType(value)),
}
}
}

impl TryFrom<ElementType> for FieldType {
type Error = ThriftProtocolError;
fn try_from(value: ElementType) -> std::result::Result<Self, Self::Error> {
impl From<ElementType> for FieldType {
fn from(value: ElementType) -> Self {
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This conversion can now be infallible

match value {
ElementType::Bool => Ok(Self::BooleanTrue),
ElementType::Byte => Ok(Self::Byte),
ElementType::I16 => Ok(Self::I16),
ElementType::I32 => Ok(Self::I32),
ElementType::I64 => Ok(Self::I64),
ElementType::Double => Ok(Self::Double),
ElementType::Binary => Ok(Self::Binary),
ElementType::List => Ok(Self::List),
ElementType::Struct => Ok(Self::Struct),
_ => Err(ThriftProtocolError::InvalidFieldType(value as u8)),
ElementType::Bool => Self::BooleanTrue,
ElementType::Byte => Self::Byte,
ElementType::I16 => Self::I16,
ElementType::I32 => Self::I32,
ElementType::I64 => Self::I64,
ElementType::Double => Self::Double,
ElementType::Binary => Self::Binary,
ElementType::List => Self::List,
ElementType::Set => Self::Set,
ElementType::Map => Self::Map,
ElementType::Struct => Self::Struct,
ElementType::Uuid => Self::Uuid,
}
}
}
Expand All @@ -212,6 +215,7 @@ pub(crate) enum ElementType {
Set = 10,
Map = 11,
Struct = 12,
Uuid = 13,
}

impl TryFrom<u8> for ElementType {
Expand All @@ -234,6 +238,7 @@ impl TryFrom<u8> for ElementType {
10 => Ok(Self::Set),
11 => Ok(Self::Map),
12 => Ok(Self::Struct),
13 => Ok(Self::Uuid),
_ => Err(ThriftProtocolError::InvalidElementType(value)),
}
}
Expand Down Expand Up @@ -506,27 +511,44 @@ pub(crate) trait ThriftCompactInputProtocol<'a> {
FieldType::I64 => self.skip_vlq().map(|_| ()),
FieldType::Double => self.skip_bytes(8).map(|_| ()),
FieldType::Binary => self.skip_binary().map(|_| ()),
// see https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#struct
FieldType::Struct => {
let mut last_field_id = 0i16;
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

small drive-by improvement

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I found this confusing at first, it works because skipping does not use the field id.

loop {
let field_ident = self.read_field_begin(last_field_id)?;
// we don't need field id for skipping, so always pass 0 for last id
let field_ident = self.read_field_begin(0)?;
if field_ident.field_type == FieldType::Stop {
break;
}
self.skip_till_depth(field_ident.field_type, depth - 1)?;
last_field_id = field_ident.id;
}
Ok(())
}
FieldType::List => {
// lists and sets are encoded the same
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

// see https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#list-and-set
FieldType::List | FieldType::Set => {
let list_ident = self.read_list_begin()?;
let element_type = FieldType::from(list_ident.element_type);
for _ in 0..list_ident.size {
let element_type = FieldType::try_from(list_ident.element_type)?;
self.skip_till_depth(element_type, depth - 1)?;
}
Ok(())
}
// no list or map types in parquet format
// see https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#map
FieldType::Map => {
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A map begins with a varint size. If size is 0 it's empty. Otherwise, the next byte encodes the key and value types with 4 bits each, and then encodes size key/value pairs.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe it would help to add a link to the relevant part of the thrift encoding spec: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#map

let size = i32::try_from(self.read_vlq()?)?;
if size > 0 {
let kv = self.read_byte()?;
let key_type = FieldType::from(ElementType::try_from(kv >> 4)?);
let val_type = FieldType::from(ElementType::try_from(kv & 0xf)?);
for _ in 0..size {
self.skip_till_depth(key_type, depth - 1)?;
self.skip_till_depth(val_type, depth - 1)?;
}
}
Ok(())
}
// see https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#universal-unique-identifier-encoding
FieldType::Uuid => self.skip_bytes(16).map(|_| ()),
_ => Err(ThriftProtocolError::SkipUnsupportedType(field_type)),
}
}
Expand Down
11 changes: 11 additions & 0 deletions parquet/tests/arrow_reader/bad_data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use arrow::util::test_util::parquet_test_data;
use bytes::Bytes;
use parquet::arrow::arrow_reader::ArrowReaderBuilder;
use parquet::errors::ParquetError;
use parquet::file::metadata::ParquetMetaDataReader;
use std::collections::HashSet;
use std::path::PathBuf;

Expand Down Expand Up @@ -175,6 +176,16 @@ fn non_standard_delta_blocks() {
}
}

#[test]
fn skip_unknown_types() {
// test file contains a FileMetaData with unknown fields with
// types not currently used by Parquet (uuid, set, map). The
// parser should be able to skip these unknown fields without
// erroring.
let file = Bytes::from_static(include_bytes!("new_types.bin"));
ParquetMetaDataReader::decode_metadata(&file).unwrap();
}

#[cfg(feature = "async")]
#[tokio::test]
#[allow(deprecated)]
Expand Down
Binary file added parquet/tests/arrow_reader/new_types.bin
Binary file not shown.
Loading