Skip to content

Commit 797d730

Browse files
PERF: Skip header check in parquet reader (rapidsai#22679)
The parquet reader previously required three reads to read the parquet footer: 1. A 4-byte read to check the header for the parquet magic bytes; 2. An 8-byte read to read the footer length and footer parquet magic bytes; 3. A variable-length read for the footer metadata. We don't really care about ensuring that the header is valid. For high-latency storage, it's not worth the extra read. Part of rapidsai#22668, which also proposes to remove the second 8-byte read. But this is a smaller change that should be less controversial. Authors: - Tom Augspurger (https://github.com/TomAugspurger) Approvers: - Bradley Dice (https://github.com/bdice) - Vukasin Milovanovic (https://github.com/vuule) URL: rapidsai#22679
1 parent ef0a96d commit 797d730

2 files changed

Lines changed: 24 additions & 5 deletions

File tree

cpp/src/io/parquet/io_utils/parquet_io_utils.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,10 @@ std::unique_ptr<cudf::io::datasource::buffer> fetch_footer_to_host(cudf::io::dat
3737
constexpr auto ender_len = sizeof(file_ender_s);
3838
size_t const len = datasource.size();
3939

40-
auto header_buffer = datasource.host_read(0, header_len);
41-
auto const header = reinterpret_cast<file_header_s const*>(header_buffer->data());
42-
auto ender_buffer = datasource.host_read(len - ender_len, ender_len);
43-
auto const ender = reinterpret_cast<file_ender_s const*>(ender_buffer->data());
4440
CUDF_EXPECTS(len > header_len + ender_len, "Incorrect data source");
45-
CUDF_EXPECTS(header->magic == detail::parquet_magic, "Corrupted header");
41+
42+
auto ender_buffer = datasource.host_read(len - ender_len, ender_len);
43+
auto const ender = reinterpret_cast<file_ender_s const*>(ender_buffer->data());
4644
CUDF_EXPECTS(ender->magic == detail::parquet_magic, "Corrupted footer");
4745
CUDF_EXPECTS(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len),
4846
"Incorrect footer length");

cpp/tests/io/parquet_reader_test.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4375,6 +4375,27 @@ TEST_F(ParquetReaderTest, LateBindSourceInfo)
43754375
CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), expected->view());
43764376
}
43774377

4378+
TEST_F(ParquetReaderTest, InvalidFooterMagic)
4379+
{
4380+
auto const expected = create_random_fixed_table<int>(4, 4, false);
4381+
4382+
std::vector<char> buffer;
4383+
cudf::io::write_parquet(
4384+
cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&buffer}, *expected));
4385+
4386+
constexpr std::array<char, 4> bad_magic{'B', 'A', 'D', '!'};
4387+
ASSERT_GE(buffer.size(), bad_magic.size());
4388+
for (size_t i = 0; i < bad_magic.size(); ++i) {
4389+
buffer[buffer.size() - bad_magic.size() + i] = bad_magic[i];
4390+
}
4391+
4392+
auto const read_opts = cudf::io::parquet_reader_options::builder(
4393+
cudf::io::source_info{cudf::host_span<std::byte const>{
4394+
reinterpret_cast<std::byte const*>(buffer.data()), buffer.size()}})
4395+
.build();
4396+
EXPECT_THROW(cudf::io::read_parquet(read_opts), cudf::logic_error);
4397+
}
4398+
43784399
TEST_F(ParquetReaderTest, DecimalTypeOption)
43794400
{
43804401
auto const data = std::vector<int32_t>{1000, 2000, 3000, 4000, 5000};

0 commit comments

Comments
 (0)