Skip to content

Commit 65993cd

Browse files
authored
GH-49434: [C++][CI] Add golden integration files to IPC file fuzz corpus (#49440)
### Rationale for this change

For the IPC stream fuzzer, we are adding the golden integration files to the seed corpus, but we are currently not doing the same thing for the file fuzzer.

### What changes are included in this PR?

1. Add golden IPC integration files to the IPC file fuzzer seed corpus
2. Minor cosmetic changes to the C++ test script, to make the output slightly less bulky (no functional difference)

### Are these changes tested?

Yes, by existing CI tests.

### Are there any user-facing changes?

No.

* GitHub Issue: #49434

Authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent d42c7dd commit 65993cd

3 files changed

Lines changed: 24 additions & 21 deletions

File tree

ci/scripts/cpp_test.sh

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -191,16 +191,19 @@ if [ "${ARROW_FUZZING}" == "ON" ]; then
191191
export ASAN_OPTIONS="$ASAN_OPTIONS allocator_may_return_null=1"
192192

193193
# 1. Generate seed corpuses
194+
# For IPC fuzz targets, these will include the golden IPC integration files.
194195
"${source_dir}/build-support/fuzzing/generate_corpuses.sh" "${binary_output_dir}"
195196

196197
# 2. Run fuzz targets on seed corpus entries
197198
function run_fuzz_target_on_seed_corpus() {
198199
fuzz_target_basename=$1
199200
corpus_dir=${binary_output_dir}/${fuzz_target_basename}_seed_corpus
200201
mkdir -p "${corpus_dir}"
201-
rm -f "${corpus_dir}"/*
202-
unzip "${binary_output_dir}"/"${fuzz_target_basename}"_seed_corpus.zip -d "${corpus_dir}"
203-
"${binary_output_dir}"/"${fuzz_target_basename}" -rss_limit_mb=4000 "${corpus_dir}"/*
202+
pushd "${corpus_dir}"
203+
unzip -q "${binary_output_dir}"/"${fuzz_target_basename}"_seed_corpus.zip -d .
204+
"${binary_output_dir}"/"${fuzz_target_basename}" -rss_limit_mb=4000 ./*
205+
popd
206+
rm -rf "${corpus_dir}"
204207
}
205208
run_fuzz_target_on_seed_corpus arrow-csv-fuzz
206209
run_fuzz_target_on_seed_corpus arrow-ipc-file-fuzz
@@ -212,22 +215,17 @@ if [ "${ARROW_FUZZING}" == "ON" ]; then
212215
fi
213216

214217
# 3. Run fuzz targets on regression files from arrow-testing
215-
# Run golden IPC integration files: these should ideally load without errors,
216-
# though some very old ones carry invalid data (such as decimal values
217-
# larger than their advertised precision).
218-
# shellcheck disable=SC2046
219-
"${binary_output_dir}/arrow-ipc-stream-fuzz" $(find "${ARROW_TEST_DATA}"/arrow-ipc-stream/integration -name "*.stream")
220-
# shellcheck disable=SC2046
221-
"${binary_output_dir}/arrow-ipc-file-fuzz" $(find "${ARROW_TEST_DATA}"/arrow-ipc-stream/integration -name "*.arrow_file")
222-
# Run known crash files
223-
"${binary_output_dir}/arrow-ipc-stream-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-stream/crash-*
224-
"${binary_output_dir}/arrow-ipc-stream-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-stream/*-testcase-*
225-
"${binary_output_dir}/arrow-ipc-file-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-file/*-testcase-*
226-
"${binary_output_dir}/arrow-ipc-tensor-stream-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-tensor-stream/*-testcase-*
218+
pushd "${ARROW_TEST_DATA}"
219+
"${binary_output_dir}/arrow-ipc-stream-fuzz" arrow-ipc-stream/crash-*
220+
"${binary_output_dir}/arrow-ipc-stream-fuzz" arrow-ipc-stream/*-testcase-*
221+
"${binary_output_dir}/arrow-ipc-file-fuzz" arrow-ipc-file/*-testcase-*
222+
"${binary_output_dir}/arrow-ipc-tensor-stream-fuzz" arrow-ipc-tensor-stream/*-testcase-*
227223
if [ "${ARROW_PARQUET}" == "ON" ]; then
228-
"${binary_output_dir}/parquet-arrow-fuzz" "${ARROW_TEST_DATA}"/parquet/fuzzing/*-testcase-*
224+
"${binary_output_dir}/parquet-arrow-fuzz" parquet/fuzzing/*-testcase-*
225+
# TODO replay encoding regression files when we have some
229226
fi
230-
"${binary_output_dir}/arrow-csv-fuzz" "${ARROW_TEST_DATA}"/csv/fuzzing/*-testcase-*
227+
"${binary_output_dir}/arrow-csv-fuzz" csv/fuzzing/*-testcase-*
228+
popd
231229
fi
232230

233231
popd

cpp/build-support/fuzzing/generate_corpuses.sh

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ set -ex
2929
CORPUS_DIR=/tmp/corpus
3030
PANDAS_DIR=/tmp/pandas
3131

32-
ARROW_ROOT=$(cd $(dirname $BASH_SOURCE)/../../..; pwd)
32+
ARROW_ROOT=$(cd $(dirname "$BASH_SOURCE")/../../..; pwd)
3333
ARROW_CPP=$ARROW_ROOT/cpp
3434
OUT=$1
3535

@@ -39,17 +39,21 @@ OUT=$1
3939

4040
# Arrow IPC
4141

42-
IPC_INTEGRATION_FILES=$(find ${ARROW_ROOT}/testing/data/arrow-ipc-stream/integration -name "*.stream")
43-
4442
rm -rf ${CORPUS_DIR}
4543
${OUT}/arrow-ipc-generate-fuzz-corpus -stream ${CORPUS_DIR}
44+
# Add "golden" IPC integration files
45+
IPC_INTEGRATION_FILES=$(find ${ARROW_ROOT}/testing/data/arrow-ipc-stream/integration -name "*.stream")
46+
[ -z "${IPC_INTEGRATION_FILES}" ] && exit 1
4647
# Several IPC integration files can have the same name, make sure
4748
# they all appear in the corpus by numbering the duplicates.
4849
cp --backup=numbered ${IPC_INTEGRATION_FILES} ${CORPUS_DIR}
4950
${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/arrow-ipc-stream-fuzz_seed_corpus.zip
5051

5152
rm -rf ${CORPUS_DIR}
5253
${OUT}/arrow-ipc-generate-fuzz-corpus -file ${CORPUS_DIR}
54+
IPC_INTEGRATION_FILES=$(find ${ARROW_ROOT}/testing/data/arrow-ipc-stream/integration -name "*.arrow_file")
55+
[ -z "${IPC_INTEGRATION_FILES}" ] && exit 1
56+
cp --backup=numbered ${IPC_INTEGRATION_FILES} ${CORPUS_DIR}
5357
${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/arrow-ipc-file-fuzz_seed_corpus.zip
5458

5559
rm -rf ${CORPUS_DIR}

cpp/src/parquet/arrow/fuzz_encoding_internal.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -467,7 +467,8 @@ Status FuzzEncoding(const uint8_t* data, int64_t size) {
467467

468468
ARROW_ASSIGN_OR_RAISE(const auto parse_result,
469469
FuzzEncodingHeader::Parse(std::span(data, size)));
470-
auto& [header, encoded_data] = parse_result;
470+
const auto header = parse_result.first;
471+
const auto encoded_data = parse_result.second;
471472
if (encoded_data.size() > static_cast<size_t>(kInt32Max)) {
472473
// Unlikely but who knows?
473474
return Status::Invalid("Fuzz payload too large");

0 commit comments

Comments
 (0)