Skip to content

Commit 1ffd202

Browse files
authored
Remove deprecated parquet::format module and thrift dependency (#9962)
# Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. --> - Closes #9953. # Rationale for this change Removal of the `parquet::format` module was planned for the 59.0 release (#9110) according to code comments. There is also now a security advisory against the Apache Thrift Rust implementation (GHSA-2f9f-gq7v-9h6m), for which there is no fixed release yet on `crates.io`. <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> # What changes are included in this PR? - Removal of the Apache Thrift dependency - Removal of the deprecated `parquet::format` module - Changes to the `parquet-layout` binary to remove printing of page details, since that still depended on the deprecated code # Are these changes tested? Existing tests pass as the code was unused. # Are there any user-facing changes? Breaking API change, since `parquet::format` was a public module. The output of the `parquet-layout` binary changes.
1 parent 48fa8a7 commit 1ffd202

7 files changed

Lines changed: 20 additions & 6305 deletions

File tree

Cargo.lock

Lines changed: 0 additions & 27 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

parquet/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ parquet-variant-compute = { workspace = true, optional = true }
5252
object_store = { workspace = true, optional = true, features = ["tokio"] }
5353

5454
bytes = { version = "1.1", default-features = false, features = ["std"] }
55-
thrift = { version = "0.17", default-features = false }
5655
snap = { version = "1.0", default-features = false, optional = true }
5756
brotli = { version = "8.0", default-features = false, features = ["std"], optional = true }
5857
# To use `flate2` you must enable either the `flate2-zlib-rs` or `flate2-rust_backened` backends

parquet/src/bin/parquet-layout.rs

Lines changed: 19 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,7 @@
1717

1818
//! Binary that prints the physical layout of a parquet file
1919
//!
20-
//! NOTE: due to this binary's use of the deprecated [`parquet::format`] module, it
21-
//! will no longer be maintained, and will likely be removed in the future.
22-
//! Alternatives to this include [`parquet-cli`] and [`parquet-viewer`].
20+
//! Alternatives to this binary include [`parquet-cli`] and [`parquet-viewer`].
2321
//!
2422
//! # Install
2523
//!
@@ -41,19 +39,14 @@
4139
//! [`parquet-viewer`]: https://github.com/xiangpenghao/parquet-viewer
4240
4341
use std::fs::File;
44-
use std::io::Read;
4542

4643
use clap::Parser;
4744
use parquet::file::metadata::ParquetMetaDataReader;
48-
use serde::Serialize;
49-
use thrift::protocol::TCompactInputProtocol;
45+
use serde::{Serialize, Serializer};
5046

51-
use parquet::basic::CompressionCodec;
47+
use parquet::basic::{CompressionCodec, Encoding};
5248
use parquet::errors::Result;
5349
use parquet::file::reader::ChunkReader;
54-
#[allow(deprecated)]
55-
use parquet::format::PageHeader;
56-
use parquet::thrift::TSerializable;
5750

5851
#[derive(Serialize, Debug)]
5952
struct Index {
@@ -87,22 +80,22 @@ struct ColumnChunk {
8780
offset_index: Option<Index>,
8881
column_index: Option<Index>,
8982
bloom_filter: Option<Index>,
90-
pages: Vec<Page>,
83+
compression: DebugSerialize<CompressionCodec>,
84+
encodings: Vec<DebugSerialize<Encoding>>,
9185
}
9286

93-
#[derive(Serialize, Debug)]
94-
struct Page {
95-
compression: Option<&'static str>,
96-
encoding: &'static str,
97-
page_type: &'static str,
98-
offset: u64,
99-
compressed_bytes: i32,
100-
uncompressed_bytes: i32,
101-
header_bytes: i32,
102-
num_values: i32,
87+
#[derive(Debug)]
88+
struct DebugSerialize<T: std::fmt::Debug>(T);
89+
90+
impl<T: std::fmt::Debug> Serialize for DebugSerialize<T> {
91+
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
92+
where
93+
S: Serializer,
94+
{
95+
serializer.serialize_str(&format!("{:?}", &self.0))
96+
}
10397
}
10498

105-
#[allow(deprecated)]
10699
fn do_layout<C: ChunkReader>(reader: &C) -> Result<ParquetFile> {
107100
let mut metadata_reader = ParquetMetaDataReader::new();
108101
metadata_reader.try_parse(reader)?;
@@ -118,55 +111,8 @@ fn do_layout<C: ChunkReader>(reader: &C) -> Result<ParquetFile> {
118111
.iter()
119112
.zip(schema.columns())
120113
.map(|(column, column_schema)| {
121-
let compression = compression(column.compression_codec());
122-
let mut pages = vec![];
123-
124-
let mut start = column
125-
.dictionary_page_offset()
126-
.unwrap_or_else(|| column.data_page_offset())
127-
as u64;
128-
129-
let end = start + column.compressed_size() as u64;
130-
while start != end {
131-
let (header_len, header) = read_page_header(reader, start)?;
132-
if let Some(dictionary) = header.dictionary_page_header {
133-
pages.push(Page {
134-
compression,
135-
encoding: encoding(dictionary.encoding.0),
136-
page_type: "dictionary",
137-
offset: start,
138-
compressed_bytes: header.compressed_page_size,
139-
uncompressed_bytes: header.uncompressed_page_size,
140-
header_bytes: header_len as _,
141-
num_values: dictionary.num_values,
142-
})
143-
} else if let Some(data_page) = header.data_page_header {
144-
pages.push(Page {
145-
compression,
146-
encoding: encoding(data_page.encoding.0),
147-
page_type: "data_page_v1",
148-
offset: start,
149-
compressed_bytes: header.compressed_page_size,
150-
uncompressed_bytes: header.uncompressed_page_size,
151-
header_bytes: header_len as _,
152-
num_values: data_page.num_values,
153-
})
154-
} else if let Some(data_page) = header.data_page_header_v2 {
155-
let is_compressed = data_page.is_compressed.unwrap_or(true);
156-
157-
pages.push(Page {
158-
compression: compression.filter(|_| is_compressed),
159-
encoding: encoding(data_page.encoding.0),
160-
page_type: "data_page_v2",
161-
offset: start,
162-
compressed_bytes: header.compressed_page_size,
163-
uncompressed_bytes: header.uncompressed_page_size,
164-
header_bytes: header_len as _,
165-
num_values: data_page.num_values,
166-
})
167-
}
168-
start += header.compressed_page_size as u64 + header_len as u64;
169-
}
114+
let compression = DebugSerialize(column.compression_codec());
115+
let encodings = column.encodings().map(DebugSerialize).collect();
170116

171117
Ok(ColumnChunk {
172118
path: column_schema.path().parts().join("."),
@@ -185,7 +131,8 @@ fn do_layout<C: ChunkReader>(reader: &C) -> Result<ParquetFile> {
185131
offset,
186132
length: column.bloom_filter_length(),
187133
}),
188-
pages,
134+
compression,
135+
encodings,
189136
})
190137
})
191138
.collect::<Result<Vec<_>>>()?;
@@ -203,58 +150,6 @@ fn do_layout<C: ChunkReader>(reader: &C) -> Result<ParquetFile> {
203150
})
204151
}
205152

206-
/// Reads the page header at `offset` from `reader`, returning
207-
/// both the `PageHeader` and its length in bytes
208-
#[allow(deprecated)]
209-
fn read_page_header<C: ChunkReader>(reader: &C, offset: u64) -> Result<(usize, PageHeader)> {
210-
struct TrackedRead<R>(R, usize);
211-
212-
impl<R: Read> Read for TrackedRead<R> {
213-
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
214-
let v = self.0.read(buf)?;
215-
self.1 += v;
216-
Ok(v)
217-
}
218-
}
219-
220-
let input = reader.get_read(offset)?;
221-
let mut tracked = TrackedRead(input, 0);
222-
let mut prot = TCompactInputProtocol::new(&mut tracked);
223-
let header = PageHeader::read_from_in_protocol(&mut prot)?;
224-
Ok((tracked.1, header))
225-
}
226-
227-
/// Returns a string representation for a given compression
228-
fn compression(compression: CompressionCodec) -> Option<&'static str> {
229-
match compression {
230-
CompressionCodec::UNCOMPRESSED => None,
231-
CompressionCodec::SNAPPY => Some("snappy"),
232-
CompressionCodec::GZIP => Some("gzip"),
233-
CompressionCodec::LZO => Some("lzo"),
234-
CompressionCodec::BROTLI => Some("brotli"),
235-
CompressionCodec::LZ4 => Some("lz4"),
236-
CompressionCodec::ZSTD => Some("zstd"),
237-
CompressionCodec::LZ4_RAW => Some("lz4_raw"),
238-
}
239-
}
240-
241-
/// Returns a string representation for a given encoding
242-
fn encoding(encoding: i32) -> &'static str {
243-
match encoding {
244-
0 => "plain",
245-
2 => "plain_dictionary",
246-
3 => "rle",
247-
#[allow(deprecated)]
248-
4 => "bit_packed",
249-
5 => "delta_binary_packed",
250-
6 => "delta_length_byte_array",
251-
7 => "delta_byte_array",
252-
8 => "rle_dictionary",
253-
9 => "byte_stream_split",
254-
_ => "unknown",
255-
}
256-
}
257-
258153
#[derive(Debug, Parser)]
259154
#[clap(author, version, about("Prints the physical layout of a parquet file"), long_about = None)]
260155
struct Args {

parquet/src/errors.rs

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -108,12 +108,6 @@ impl From<snap::Error> for ParquetError {
108108
}
109109
}
110110

111-
impl From<thrift::Error> for ParquetError {
112-
fn from(e: thrift::Error) -> ParquetError {
113-
ParquetError::External(Box::new(e))
114-
}
115-
}
116-
117111
impl From<cell::BorrowMutError> for ParquetError {
118112
fn from(e: cell::BorrowMutError) -> ParquetError {
119113
ParquetError::External(Box::new(e))

0 commit comments

Comments
 (0)