diff --git a/.gitignore b/.gitignore index b33bc7d24112c..5f9bd94ebf6c2 100644 --- a/.gitignore +++ b/.gitignore @@ -68,3 +68,4 @@ uv.lock /plots/ /test/testdrive/types.parquet* /test/mz-deploy/**/target/ +target-fuzz/ diff --git a/Cargo.toml b/Cargo.toml index aa43948698d0f..d9d7f707265cc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -260,6 +260,11 @@ exclude = [ "misc/wasm/*", # Ignore any Rust dependencies that python packages might pull in. "misc/python/venv/*", + # The `src/*/fuzz` cargo-fuzz crates need no entry here: each sets + # `package.workspace = "../../../test/cargo-fuzz"`, attaching it to the fuzz + # workspace, and a crate nested under a workspace member is never + # auto-included in the root workspace. They build on a nightly toolchain + # (libFuzzer) via `cargo +nightly fuzz run ...` or `ci/test/cargo-fuzz.sh`. ] # Use Cargo's new feature resolver, which can handle target-specific features, diff --git a/ci/builder/Dockerfile b/ci/builder/Dockerfile index 5fdf51503bf5b..5c640392573e0 100644 --- a/ci/builder/Dockerfile +++ b/ci/builder/Dockerfile @@ -247,6 +247,14 @@ RUN mkdir rust \ && cargo install --root /usr/local --version "=0.1.61" --locked --features=vendored-openssl cargo-udeps \ && cargo install --root /usr/local --version "=0.4.0" --locked cargo-binutils \ && cargo install --root /usr/local --version "=0.13.1" --locked wasm-pack \ + && if [ "$RUST_VERSION" = "nightly" ]; then \ + # NOTE: no --locked, unlike the installs above. cargo-fuzz 0.13.1's \ + # bundled Cargo.lock pins deps that fail on the pinned nightly \ + # (yanked futures-util/zip, plus a crate using the perma-unstable \ + # `rustc_layout_scalar_valid_range_*` attribute). Let cargo resolve \ + # compatible versions instead. 
\ + cargo install --root /usr/local --version "=0.13.1" cargo-fuzz; \ + fi \ && rm -rf /cargo/registry /cargo/git # Shims for sanitizers diff --git a/ci/plugins/mzcompose/hooks/command b/ci/plugins/mzcompose/hooks/command index 378109bc98930..5bd127649dc62 100644 --- a/ci/plugins/mzcompose/hooks/command +++ b/ci/plugins/mzcompose/hooks/command @@ -353,6 +353,7 @@ cleanup() { killall -9 -q clusterd || true # There might be remaining processes from a cargo-test run if [ ! -s services.log ] \ + && [ "$BUILDKITE_LABEL" != ":rust: cargo-fuzz" ] \ && [ "$BUILDKITE_LABEL" != "Maelstrom coverage of persist" ] \ && [ "$BUILDKITE_LABEL" != "Long single-node Maelstrom coverage of persist" ] \ && [ "$BUILDKITE_LABEL" != "Maelstrom coverage of txn-wal" ] \ diff --git a/ci/plugins/mzcompose/plugin.yml b/ci/plugins/mzcompose/plugin.yml index 421eb4a1c437a..c73265c928e88 100644 --- a/ci/plugins/mzcompose/plugin.yml +++ b/ci/plugins/mzcompose/plugin.yml @@ -21,5 +21,7 @@ configuration: type: string composition: type: string + ci_builder: + type: string required: ["composition"] additionalProperties: false diff --git a/ci/release-qualification/pipeline.template.yml b/ci/release-qualification/pipeline.template.yml index 20477e6497491..8f9fde004cf7b 100644 --- a/ci/release-qualification/pipeline.template.yml +++ b/ci/release-qualification/pipeline.template.yml @@ -254,6 +254,29 @@ steps: composition: sqlsmith args: [--max-joins=15, --explain-only, --runtime=6000] + - id: cargo-fuzz + label: ":rust: cargo-fuzz" + depends_on: [] + timeout_in_minutes: 1440 + agents: + queue: hetzner-x86-64-dedi-48cpu-192gb + sanitizer: skip + plugins: + - ./ci/plugins/mzcompose: + composition: cargo-fuzz + ci_builder: nightly + args: + - --profile=fruitful + - --max-seconds=86400 + - --wall-budget=84600 + # Step hard-times out at 1440min (86400s). --wall-budget ends fuzzing + # at 84600s, leaving 1800s; cap minimize at 1200s so the corpus + # upload has ~600s of headroom before the kill. 
+ - --minimize-timeout=1200 + - --corpus-sync + artifact_paths: + - src/*/fuzz/artifacts/**/* + - id: test-preflight-check-rollback label: Test with preflight check and rollback depends_on: [] diff --git a/ci/test/lint-buf/generate-buf-config.py b/ci/test/lint-buf/generate-buf-config.py index e891682ad2ae1..7a97ef8a322bf 100755 --- a/ci/test/lint-buf/generate-buf-config.py +++ b/ci/test/lint-buf/generate-buf-config.py @@ -18,6 +18,11 @@ SOURCE_DIR = "src/" PROTO_FILE_GLOB = f"{SOURCE_DIR}**/*.proto" +# Each fuzz crate (`src/<crate>/fuzz`) is its own cargo `[workspace]`, so +# building it standalone creates `src/<crate>/fuzz/target/`. Build-script deps +# (e.g. `protobuf-src`) extract `.proto` files into that tree, which buf must +# not scan. We exclude every fuzz crate's `target/` instead of hand-listing them. +FUZZ_CRATE_GLOB = f"{SOURCE_DIR}*/fuzz" GENERATION_COMMENT = "File generated by generate-buf-config.py - DO NOT EDIT" BUF_INSTRUCTION_PREFIX = "// buf breaking:" @@ -37,6 +42,11 @@ def is_ignore(self) -> bool: def collect_proto_files() -> list[ProtoFile]: print(f"Working dir: {os.getcwd()}") proto_file_paths = glob.glob(PROTO_FILE_GLOB, recursive=True) + # Filter out build artifacts: each fuzz crate is its own `[workspace]`, so + # building it standalone creates `src/<crate>/fuzz/target/`. A build-script + # dep (`protobuf-src`) vendors protoc's bundled `.proto` files (Google's + # well-known types) into that tree, which is not source we want buf to scan. + proto_file_paths = [p for p in proto_file_paths if "/target/" not in p] return [ProtoFile(path) for path in proto_file_paths] @@ -82,6 +92,20 @@ def generate_buf_ignore_section(ignored_files: list[ProtoFile]) -> str: return "\n".join(ignore_entry_lines).strip() +def generate_fuzz_target_excludes() -> str: + fuzz_crate_dirs = sorted(d for d in glob.glob(FUZZ_CRATE_GLOB) if os.path.isdir(d)) + exclude_lines = [] + for fuzz_dir in fuzz_crate_dirs: + # e.g. 
"src/transform/fuzz" -> "transform/fuzz/target" + relative_path = fuzz_dir.removeprefix(SOURCE_DIR) + exclude_lines.append(f" - {relative_path}/target") + + if len(exclude_lines) == 0: + exclude_lines.append(" # none") + + return "\n".join(exclude_lines).strip() + + def write_buf_configuration( template_path: str, target_path: str, ignored_files: list[ProtoFile] ) -> None: @@ -92,6 +116,9 @@ def write_buf_configuration( content = content.replace( "${ignore-entries}", generate_buf_ignore_section(ignored_files) ) + content = content.replace( + "${fuzz-target-excludes}", generate_fuzz_target_excludes() + ) with open(target_path, "w") as output_file: output_file.write(content) diff --git a/misc/python/materialize/cli/ci_annotate_errors.py b/misc/python/materialize/cli/ci_annotate_errors.py index 5920bd05bcf06..4110b13c1b98b 100644 --- a/misc/python/materialize/cli/ci_annotate_errors.py +++ b/misc/python/materialize/cli/ci_annotate_errors.py @@ -100,6 +100,9 @@ # \s\S is any character including newlines, so this matches multiline strings # non-greedy using ? 
so that we don't match all the result comparison issues into one block | ----------\ RESULT\ COMPARISON\ ISSUE\ START\ ----------[\s\S]*?----------\ RESULT\ COMPARISON\ ISSUE\ END\ ------------ + # cargo-fuzz crash, emitted by the cargo-fuzz mzcompose runner (one block + # per failing target, with the crash input and a reproduce command) + | ----------\ CARGO-FUZZ\ FAILURE\ START\ ----------[\s\S]*?----------\ CARGO-FUZZ\ FAILURE\ END\ ---------- # output consistency tests # | possibly\ invalid\ operation\ specification # disabled # for miri test summary diff --git a/src/avro/fuzz/.gitignore b/src/avro/fuzz/.gitignore new file mode 100644 index 0000000000000..ab0eaa1a49031 --- /dev/null +++ b/src/avro/fuzz/.gitignore @@ -0,0 +1,5 @@ +target/ +corpus/ +artifacts/ +coverage/ +Cargo.lock diff --git a/src/avro/fuzz/Cargo.toml b/src/avro/fuzz/Cargo.toml new file mode 100644 index 0000000000000..96118872559ff --- /dev/null +++ b/src/avro/fuzz/Cargo.toml @@ -0,0 +1,43 @@ +# Fuzz crate for mz-avro decoders. Avro bytes arrive from Kafka, so a +# decoder bug here is a crash/poisoning risk for source ingestion. +# +# Excluded from the main workspace because libFuzzer requires nightly Rust. +# Run via the repo-wide runner: `bin/ci-builder run nightly ci/test/cargo-fuzz.sh`, +# or locally: +# cd src/avro/fuzz +# cargo +nightly fuzz run reader_decode -- -max_total_time=60 + +[package] +workspace = "../../../test/cargo-fuzz" +name = "mz-avro-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +mz-avro = { path = ".." 
} + +[[bin]] +name = "reader_decode" +path = "fuzz_targets/reader_decode.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "schema_resolve" +path = "fuzz_targets/schema_resolve.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "avro_schema_parse" +path = "fuzz_targets/avro_schema_parse.rs" +test = false +doc = false +bench = false diff --git a/src/avro/fuzz/corpus.dict b/src/avro/fuzz/corpus.dict new file mode 100644 index 0000000000000..4740480132fd9 --- /dev/null +++ b/src/avro/fuzz/corpus.dict @@ -0,0 +1,44 @@ +# libFuzzer dictionary for the Avro reader_decode target. +# +# An Avro object-container file starts with the 4-byte magic "Obj\x01" followed +# by a map of metadata (notably "avro.schema" and "avro.codec") and a 16-byte +# sync marker. Without the magic the decoder bails immediately, so the most +# valuable token by far is the magic itself; the rest let the mutator build +# plausible headers and schemas. + +# Object-container magic. +"Obj\x01" +# Header metadata keys / codecs. +"avro.schema" +"avro.codec" +"null" +"deflate" +"snappy" +"zstandard" +"bzip2" +# Schema JSON tokens. +"\"type\"" +"\"name\"" +"\"fields\"" +"\"record\"" +"\"enum\"" +"\"array\"" +"\"map\"" +"\"union\"" +"\"fixed\"" +"\"symbols\"" +"\"items\"" +"\"values\"" +"\"size\"" +"\"namespace\"" +"\"default\"" +"\"logicalType\"" +"\"string\"" +"\"bytes\"" +"\"int\"" +"\"long\"" +"\"float\"" +"\"double\"" +"\"boolean\"" +"decimal" +"timestamp-millis" diff --git a/src/avro/fuzz/fuzz_targets/avro_schema_parse.rs b/src/avro/fuzz/fuzz_targets/avro_schema_parse.rs new file mode 100644 index 0000000000000..52cbe61c7a44c --- /dev/null +++ b/src/avro/fuzz/fuzz_targets/avro_schema_parse.rs @@ -0,0 +1,231 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `mz_avro::Schema::from_str` parses an Avro schema from JSON, +//! the schema that arrives from an external, possibly hostile, schema registry +//! (or straight from a user's `CREATE SOURCE … USING SCHEMA`). It is a +//! hand-written recursive descent over the parsed JSON, with named-type +//! definition/reference resolution, aliases, logical types, and a +//! `MAX_SCHEMA_DEPTH` guard against stack overflow on deeply nested types. +//! +//! `schema_resolve` only parses the narrow set of schemas it generates (no +//! logical types, no named back-references, shallow), and the decode targets +//! cap nesting low. This one stresses the parser itself: it generates schema +//! JSON that can nest *past* the depth limit (so the guard must fire cleanly +//! rather than overflow the stack) and that re-references already-defined names +//! (recursive definitions, a distinct resolution path). +//! +//! It also drives the parser's *naming* and *validation* paths: +//! * `namespace` fields and dotted/`a.b.C` names. The `FullName::from_parts` +//! split logic, including the documented edge case where a name has dots +//! *and* a `namespace` is also given (they may disagree). +//! * `aliases` arrays on named types, sometimes deliberately colliding with +//! another defined name or with the type's own name. +//! * structurally-valid-but-semantically-invalid schemas the parser is meant +//! to *reject* (not panic on): duplicate enum symbols, duplicate record +//! field names, decimal `scale > precision`, and `fixed` `size: 0`. +//! Parsing must never panic. It returns `Ok`/`Err`. 
+ +#![no_main] + +use std::str::FromStr; + +use libfuzzer_sys::arbitrary::{self, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_avro::Schema; + +const PRIMITIVES: &[&str] = &[ + "null", "boolean", "int", "long", "float", "double", "bytes", "string", +]; + +/// Generate the optional `name`-affecting JSON attributes for a named type: +/// the `name` itself (sometimes dotted, i.e. namespace-qualified inline), an +/// optional separate `namespace`, and an optional `aliases` array (sometimes +/// colliding with another defined name or with the type's own name). Returns +/// only the attribute text; the caller records `leaf` in `defined` for later +/// back-references. Exercises `Name::parse` / `FullName::from_parts`, including +/// the dotted-name-plus-`namespace` edge case the source flags as `[XXX]`. +fn gen_named_attrs( + u: &mut Unstructured, + leaf: &str, + defined: &[String], +) -> arbitrary::Result { + let mut attrs = String::new(); + + // The `name`: usually bare, sometimes dotted (carries an inline namespace), + // which drives the rfind('.') split in `FullName::from_parts`. + match u.int_in_range(0u8..=3)? { + 0 => attrs.push_str(&format!("\"name\":\"{leaf}\"")), + 1 => attrs.push_str(&format!("\"name\":\"ns.{leaf}\"")), + 2 => attrs.push_str(&format!("\"name\":\"a.b.{leaf}\"")), + // Leading dot → empty namespace component. + _ => attrs.push_str(&format!("\"name\":\".{leaf}\"")), + } + + // Optional separate `namespace`. Combined with a dotted name above, this + // hits the case where the computed and provided namespaces may disagree. + match u.int_in_range(0u8..=2)? { + 0 => {} + 1 => attrs.push_str(",\"namespace\":\"ns\""), + _ => attrs.push_str(",\"namespace\":\"other\""), + } + + // Optional `aliases` array. + match u.int_in_range(0u8..=4)? { + 0 | 1 => {} + 2 => attrs.push_str(",\"aliases\":[\"AnAlias\"]"), + // Alias colliding with the type's own (leaf) name. 
+ 3 => attrs.push_str(&format!(",\"aliases\":[\"{leaf}\"]")), + // Alias colliding with some other already-defined name, if any. + _ => { + if let Some(other) = defined.last() { + attrs.push_str(&format!(",\"aliases\":[\"{other}\"]")); + } else { + attrs.push_str(",\"aliases\":[\"AnAlias\"]"); + } + } + } + + Ok(attrs) +} + +/// Generate a schema that is *structurally* valid JSON for a named type but +/// *semantically* invalid per the Avro spec. The parser must reject it with an +/// error rather than panic. +fn gen_invalid(u: &mut Unstructured, counter: &mut u32) -> arbitrary::Result { + *counter += 1; + Ok(match u.int_in_range(0u8..=4)? { + // Enum with duplicate symbols. + 0 => format!( + "{{\"type\":\"enum\",\"name\":\"E{counter}\",\"symbols\":[\"A\",\"B\",\"A\"]}}" + ), + // Record with duplicate field names. + 1 => format!( + "{{\"type\":\"record\",\"name\":\"R{counter}\",\"fields\":[\ + {{\"name\":\"dup\",\"type\":\"int\"}},\ + {{\"name\":\"dup\",\"type\":\"string\"}}]}}" + ), + // Decimal with scale > precision. + 2 => "{\"type\":\"bytes\",\"logicalType\":\"decimal\",\"precision\":2,\"scale\":9}" + .to_string(), + // Fixed with size 0. + 3 => format!("{{\"type\":\"fixed\",\"name\":\"F{counter}\",\"size\":0}}"), + // Enum default not among the symbols. + _ => format!( + "{{\"type\":\"enum\",\"name\":\"E{counter}\",\"symbols\":[\"A\",\"B\"],\"default\":\"Z\"}}" + ), + }) +} + +/// A leaf type: a primitive, a logical type, or a reference to an +/// already-defined named type (which makes the schema recursive). +fn gen_leaf(u: &mut Unstructured, defined: &[String]) -> arbitrary::Result { + Ok(match u.int_in_range(0u8..=7)? 
{ + 0 => format!("\"{}\"", u.choose(PRIMITIVES)?), + 1 => "{\"type\":\"int\",\"logicalType\":\"date\"}".to_string(), + 2 => "{\"type\":\"long\",\"logicalType\":\"timestamp-micros\"}".to_string(), + 3 => "{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"}".to_string(), + 4 => "{\"type\":\"string\",\"logicalType\":\"uuid\"}".to_string(), + 5 => { + let precision = u.int_in_range(1u32..=38)?; + let scale = u.int_in_range(0u32..=precision)?; + format!( + "{{\"type\":\"bytes\",\"logicalType\":\"decimal\",\"precision\":{precision},\"scale\":{scale}}}" + ) + } + // Reference a previously-defined name when one exists (recursive + // definition), otherwise fall back to a primitive. + _ if !defined.is_empty() => format!("\"{}\"", u.choose(defined)?), + _ => format!("\"{}\"", u.choose(PRIMITIVES)?), + }) +} + +fn gen_type( + u: &mut Unstructured, + counter: &mut u32, + defined: &mut Vec, + depth: u32, +) -> arbitrary::Result { + if depth == 0 || u.is_empty() { + return gen_leaf(u, defined); + } + Ok(match u.int_in_range(0u8..=11)? { + 0 | 1 => gen_leaf(u, defined)?, + // Array / map / union: the cheap recursive wrappers that let the + // fuzzer drive nesting depth (potentially past MAX_SCHEMA_DEPTH). + 2 => format!( + "{{\"type\":\"array\",\"items\":{}}}", + gen_type(u, counter, defined, depth - 1)? + ), + 3 => format!( + "{{\"type\":\"map\",\"values\":{}}}", + gen_type(u, counter, defined, depth - 1)? + ), + 4 => format!("[{}]", gen_type(u, counter, defined, depth - 1)?), + 5 => format!("[\"null\",{}]", gen_type(u, counter, defined, depth - 1)?), + // Record: defines a new name (recorded so later leaves can reference + // it). The name may be dotted/namespaced and may carry `aliases`. 
+ 6 | 7 => { + *counter += 1; + let leaf = format!("R{counter}"); + let attrs = gen_named_attrs(u, &leaf, defined)?; + defined.push(leaf); + let n = u.int_in_range(0u8..=3)?; + let mut fields = Vec::with_capacity(n.into()); + for i in 0..n { + let ty = gen_type(u, counter, defined, depth - 1)?; + fields.push(format!("{{\"name\":\"f{i}\",\"type\":{ty}}}")); + } + format!( + "{{\"type\":\"record\",{attrs},\"fields\":[{}]}}", + fields.join(",") + ) + } + // Enum: also name-attribute-bearing, sometimes with a `default` symbol. + 8 => { + *counter += 1; + let leaf = format!("E{counter}"); + let attrs = gen_named_attrs(u, &leaf, defined)?; + defined.push(leaf); + let default = if u.int_in_range(0u8..=1)? == 0 { + ",\"default\":\"B\"" + } else { + "" + }; + format!("{{\"type\":\"enum\",{attrs},\"symbols\":[\"A\",\"B\",\"C\"]{default}}}") + } + // Fixed. + 9 => { + *counter += 1; + let leaf = format!("F{counter}"); + let attrs = gen_named_attrs(u, &leaf, defined)?; + defined.push(leaf); + let size = u.int_in_range(0u32..=32)?; + format!("{{\"type\":\"fixed\",{attrs},\"size\":{size}}}") + } + // A structurally-valid-but-semantically-invalid named type: the parser + // must reject it cleanly rather than panic. + _ => gen_invalid(u, counter)?, + }) +} + +fn run(mut u: Unstructured) -> arbitrary::Result<()> { + let mut counter = 0u32; + let mut defined = Vec::new(); + // Start deeper than MAX_SCHEMA_DEPTH (128) so the fuzzer can drive nesting + // past the limit and exercise the depth guard, not just shallow schemas. 
+ let depth = u.int_in_range(1u32..=200)?; + let schema = gen_type(&mut u, &mut counter, &mut defined, depth)?; + let _ = Schema::from_str(&schema); + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let _ = run(Unstructured::new(data)); +}); diff --git a/src/avro/fuzz/fuzz_targets/reader_decode.rs b/src/avro/fuzz/fuzz_targets/reader_decode.rs new file mode 100644 index 0000000000000..f64f3b34101de --- /dev/null +++ b/src/avro/fuzz/fuzz_targets/reader_decode.rs @@ -0,0 +1,379 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: decoding arbitrary bytes as an Avro object-container file +//! (header + blocks) must not crash. Avro bytes arrive from Kafka and +//! externally-managed schema registries, so any panic/SEGV reachable from +//! the wire format is a real availability bug. +//! +//! An OCF starts with a 4-byte magic, a metadata map carrying the writer +//! schema, and a 16-byte sync marker, then length-framed data blocks each +//! terminated by that same sync marker. Random bytes fail the magic check on +//! the first four bytes, so a raw-byte body never gets past `Reader::new`. The +//! block iteration, sync-marker matching, and per-object value decode (the bulk +//! of the surface) then stay completely unexercised. So we *build* a valid +//! container: generate a structured writer schema (`Ty`), serialize it into the +//! header's `avro.schema` metadata, and Avro-binary-encode the objects of each +//! block against that schema. +//! +//! The compression codec is chosen per file rather than hardcoded to `null`, +//! which exercises the otherwise-dead decompress dispatch in `read_block_next`: +//! 
* `null`: the encoded block bytes pass straight through. +//! * `deflate` with *correctly* compressed bytes: drives the happy-path +//! `Codec::decompress` (flate2) round-trip the writer normally produces. +//! * `deflate` with the *raw* (uncompressed) bytes left in place: the deflate +//! decoder hits a malformed stream, exercising the decompress-error path. +//! * an unrecognized codec string: the `UnrecognizedCodec` header-parse +//! error. (`snappy` is gated behind a Cargo feature that this fuzz crate +//! does not enable, so it is intentionally not generated.) +//! We also corrupt the declared block framing: occasionally the object-count or +//! byte-size prefix is replaced with a huge or wildly inconsistent value, which +//! must be caught by the `safe_len` allocation guard rather than spinning or +//! over-allocating. And we still hit the structural error paths, occasionally a +//! corrupted trailing sync marker (the mismatch path) or a whole-file truncation +//! mid-block (the short-read path). Decoding must never panic in any of these ways. + +#![no_main] + +use libfuzzer_sys::arbitrary::{self, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_avro::{Codec, Reader}; + +/// A generated Avro type. Structured (not straight-to-JSON) so the object +/// encoder can walk the same type. An array is one item type in the schema but +/// N item values in a block. +enum Ty { + Null, + Bool, + Int, + Long, + Float, + Double, + String, + Bytes, + Fixed(u32, u32), + Enum(u32), + Record(u32, Vec), + Array(Box), + Map(Box), + Nullable(Box), +} + +fn gen_ty(u: &mut Unstructured, counter: &mut u32, depth: u32) -> arbitrary::Result { + let choice = if depth == 0 || u.is_empty() { + u.int_in_range(0u8..=9)? + } else { + u.int_in_range(0u8..=13)? 
+ }; + Ok(match choice { + 0 => Ty::Null, + 1 => Ty::Bool, + 2 => Ty::Int, + 3 => Ty::Long, + 4 => Ty::Float, + 5 => Ty::Double, + 6 => Ty::String, + 7 => Ty::Bytes, + 8 => { + *counter += 1; + let size = u.int_in_range(0u32..=24)?; + Ty::Fixed(*counter, size) + } + 9 => { + *counter += 1; + Ty::Enum(*counter) + } + 10 => { + *counter += 1; + let name = *counter; + let n = u.int_in_range(0u8..=3)?; + let mut fields = Vec::with_capacity(n.into()); + for _ in 0..n { + fields.push(gen_ty(u, counter, depth - 1)?); + } + Ty::Record(name, fields) + } + 11 => Ty::Array(Box::new(gen_ty(u, counter, depth - 1)?)), + 12 => Ty::Map(Box::new(gen_ty(u, counter, depth - 1)?)), + _ => Ty::Nullable(Box::new(gen_ty(u, counter, depth - 1)?)), + }) +} + +fn ty_to_json(ty: &Ty, out: &mut String) { + match ty { + Ty::Null => out.push_str("\"null\""), + Ty::Bool => out.push_str("\"boolean\""), + Ty::Int => out.push_str("\"int\""), + Ty::Long => out.push_str("\"long\""), + Ty::Float => out.push_str("\"float\""), + Ty::Double => out.push_str("\"double\""), + Ty::String => out.push_str("\"string\""), + Ty::Bytes => out.push_str("\"bytes\""), + Ty::Fixed(n, size) => { + out.push_str(&format!("{{\"type\":\"fixed\",\"name\":\"F{n}\",\"size\":{size}}}")) + } + Ty::Enum(n) => out.push_str(&format!( + "{{\"type\":\"enum\",\"name\":\"E{n}\",\"symbols\":[\"A\",\"B\",\"C\"]}}" + )), + Ty::Record(n, fields) => { + out.push_str(&format!("{{\"type\":\"record\",\"name\":\"R{n}\",\"fields\":[")); + for (i, f) in fields.iter().enumerate() { + if i > 0 { + out.push(','); + } + out.push_str(&format!("{{\"name\":\"g{i}\",\"type\":")); + ty_to_json(f, out); + out.push('}'); + } + out.push_str("]}"); + } + Ty::Array(item) => { + out.push_str("{\"type\":\"array\",\"items\":"); + ty_to_json(item, out); + out.push('}'); + } + Ty::Map(values) => { + out.push_str("{\"type\":\"map\",\"values\":"); + ty_to_json(values, out); + out.push('}'); + } + Ty::Nullable(inner) => { + out.push_str("[\"null\","); + 
ty_to_json(inner, out); + out.push(']'); + } + } +} + +/// Avro encodes int/long (and all block counts and lengths) as zig-zag varints. +fn encode_long(n: i64, out: &mut Vec) { + let mut z = ((n << 1) ^ (n >> 63)) as u64; + loop { + if z & !0x7f == 0 { + out.push(z as u8); + return; + } + out.push(((z & 0x7f) | 0x80) as u8); + z >>= 7; + } +} + +/// Encode length-prefixed bytes (the wire form of both `string` and `bytes`, +/// and of each Avro map/metadata entry). +fn encode_blob(bytes: &[u8], out: &mut Vec) { + encode_long(bytes.len() as i64, out); + out.extend_from_slice(bytes); +} + +fn encode_str(u: &mut Unstructured, out: &mut Vec) -> arbitrary::Result<()> { + let n = u.int_in_range(0usize..=8)?; + let mut s = Vec::with_capacity(n); + for _ in 0..n { + s.push(u.int_in_range(0x20u8..=0x7e)?); + } + encode_blob(&s, out); + Ok(()) +} + +/// Avro-binary-encode one value of type `ty`. +fn encode_value(u: &mut Unstructured, ty: &Ty, out: &mut Vec) -> arbitrary::Result<()> { + match ty { + Ty::Null => {} + Ty::Bool => out.push(u.int_in_range(0u8..=1)?), + Ty::Int => encode_long(i64::from(u.arbitrary::()?), out), + Ty::Long => encode_long(u.arbitrary::()?, out), + Ty::Float => out.extend_from_slice(&u.arbitrary::()?.to_le_bytes()), + Ty::Double => out.extend_from_slice(&u.arbitrary::()?.to_le_bytes()), + Ty::String => encode_str(u, out)?, + Ty::Bytes => { + let n = u.int_in_range(0usize..=12)?; + let mut b = Vec::with_capacity(n); + for _ in 0..n { + b.push(u.arbitrary::()?); + } + encode_blob(&b, out); + } + Ty::Fixed(_, size) => { + for _ in 0..*size { + out.push(u.arbitrary::()?); + } + } + Ty::Enum(_) => encode_long(u.int_in_range(0i64..=3)?, out), + Ty::Record(_, fields) => { + for f in fields { + encode_value(u, f, out)?; + } + } + Ty::Array(item) => { + let n = u.int_in_range(0i64..=3)?; + if n > 0 { + encode_long(n, out); + for _ in 0..n { + encode_value(u, item, out)?; + } + } + encode_long(0, out); + } + Ty::Map(values) => { + let n = 
u.int_in_range(0i64..=3)?; + if n > 0 { + encode_long(n, out); + for _ in 0..n { + encode_str(u, out)?; // key + encode_value(u, values, out)?; + } + } + encode_long(0, out); + } + Ty::Nullable(inner) => { + if u.int_in_range(0u8..=3)? == 0 { + encode_long(0, out); // branch 0 = null + } else { + encode_long(1, out); // branch 1 = T + encode_value(u, inner, out)?; + } + } + } + Ok(()) +} + +fn run(mut u: Unstructured) -> arbitrary::Result<()> { + // A quarter of the time, feed the raw remaining bytes: this exercises the + // magic-byte rejection and the header-parse error paths that a well-formed + // container always skips. + if u.int_in_range(0u8..=3)? == 0 { + let data = u.take_rest(); + if let Ok(reader) = Reader::new(data) { + for item in reader { + let _ = item; + } + } + return Ok(()); + } + + // Writer schema: a record (the usual OCF top-level), spanning the value + // types the general decoder walks. + let mut counter = 0u32; + let n = u.int_in_range(1u8..=6)?; + let mut fields = Vec::with_capacity(n.into()); + for _ in 0..n { + fields.push(gen_ty(&mut u, &mut counter, 3)?); + } + let top = Ty::Record(0, fields); + let mut schema = String::new(); + ty_to_json(&top, &mut schema); + + // Pick a per-file codec. `CodecMode` says both what string goes in the + // header and how (or whether) each block's encoded bytes get compressed, so + // we can drive the happy-path decompress *and* the decompress-error path. + enum CodecMode { + Null, + /// Correctly deflate-compress each block (the writer's normal output). + Deflate, + /// Declare `deflate` but leave the bytes uncompressed (malformed stream). + DeflateGarbage, + /// An unrecognized codec name → `UnrecognizedCodec` at header parse. + Unrecognized, + } + let codec_mode = match u.int_in_range(0u8..=5)? 
{ + 0 | 1 => CodecMode::Null, + 2 | 3 => CodecMode::Deflate, + 4 => CodecMode::DeflateGarbage, + _ => CodecMode::Unrecognized, + }; + let codec_name: &[u8] = match codec_mode { + CodecMode::Null => b"null", + CodecMode::Deflate | CodecMode::DeflateGarbage => b"deflate", + // A name that is never a real codec. `Reader::new` must reject it + // cleanly rather than panic. + CodecMode::Unrecognized => b"lz4-but-not-really", + }; + + let mut out = Vec::new(); + out.extend_from_slice(b"Obj\x01"); // OCF magic + // File metadata: a map with the writer schema and the chosen + // codec, as one block of two entries. + encode_long(2, &mut out); + encode_blob(b"avro.schema", &mut out); + encode_blob(schema.as_bytes(), &mut out); + encode_blob(b"avro.codec", &mut out); + encode_blob(codec_name, &mut out); + encode_long(0, &mut out); // end of metadata map + let sync: [u8; 16] = u.arbitrary()?; + out.extend_from_slice(&sync); + + // Data blocks: each is object-count, byte-size, the (possibly compressed) + // encoded objects, then the file's sync marker. + let nblocks = u.int_in_range(0u8..=2)?; + for _ in 0..nblocks { + let nobj = u.int_in_range(1i64..=3)?; + let mut objs = Vec::new(); + for _ in 0..nobj { + encode_value(&mut u, &top, &mut objs)?; + } + // Compress the block payload to match the declared codec. With + // `DeflateGarbage` we deliberately leave it uncompressed so the deflate + // decoder sees a malformed stream. + match codec_mode { + CodecMode::Deflate => { + // `compress` is infallible for in-memory deflate. If it ever + // errored we'd just skip this input. + if Codec::Deflate.compress(&mut objs).is_err() { + return Ok(()); + } + } + CodecMode::Null | CodecMode::DeflateGarbage | CodecMode::Unrecognized => {} + } + + // The block framing prefixes. 
Usually honest, but occasionally we lie + // about the object count or the byte size, including absurdly large or + // negative (→ huge `usize`) values, to exercise the `safe_len` guard + // and the short-read handling, neither of which may panic. + let (count_prefix, size_prefix) = match u.int_in_range(0u8..=9)? { + // Honest framing (the common case). + 0..=6 => (nobj, objs.len() as i64), + // Huge object count with the real byte size: the reader must cap + // the count via `safe_len` instead of trying to decode billions. + 7 => (u.int_in_range(1i64 << 40..=i64::MAX)?, objs.len() as i64), + // Negative byte size (wraps to an enormous `usize`): `safe_len` + // must reject it rather than attempt a giant allocation. + 8 => (nobj, -u.int_in_range(1i64..=i64::MAX)?), + // Byte size far larger than the bytes actually present → short read. + _ => (nobj, objs.len() as i64 + (1i64 << 30)), + }; + encode_long(count_prefix, &mut out); + encode_long(size_prefix, &mut out); + out.extend_from_slice(&objs); + // Usually the correct sync, occasionally a wrong one to drive the + // sync-marker mismatch path. + if u.int_in_range(0u8..=7)? == 0 { + out.extend_from_slice(&u.arbitrary::<[u8; 16]>()?); + } else { + out.extend_from_slice(&sync); + } + } + + // Occasionally truncate the whole file mid-block (short-read path). + if !out.is_empty() && u.int_in_range(0u8..=7)? == 0 { + let keep = u.int_in_range(0usize..=out.len())?; + out.truncate(keep); + } + + let Ok(reader) = Reader::new(&out[..]) else { + return Ok(()); + }; + for item in reader { + let _ = item; + } + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let _ = run(Unstructured::new(data)); +}); diff --git a/src/avro/fuzz/fuzz_targets/schema_resolve.rs b/src/avro/fuzz/fuzz_targets/schema_resolve.rs new file mode 100644 index 0000000000000..2488ada64d1dd --- /dev/null +++ b/src/avro/fuzz/fuzz_targets/schema_resolve.rs @@ -0,0 +1,465 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. 
+// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `resolve_schemas` reconciles a writer schema with a reader +//! schema (the core of decoding Kafka Avro data whose writer schema came from +//! an external, possibly hostile, schema registry). It walks both schemas in +//! lock-step doing type promotion, default substitution, and union matching, so +//! a panic here is an availability bug for source ingestion. +//! +//! Random bytes almost never parse as an Avro schema, so we generate *valid* +//! schema JSON from the fuzz input. Two independently-named random schemas +//! almost never line up, so resolution would fail at the very first node (a +//! record/enum/fixed name mismatch) and the interesting resolve branches would +//! stay dead. +//! +//! Instead we generate one structured shape (`Shape`) and emit *two* JSON +//! renderings of it: a writer rendering and a reader rendering that share all +//! record/enum/fixed names but deliberately differ in ways the resolver is +//! supposed to handle, so we actually reach its non-trivial paths: +//! * primitive promotion. A writer `int` rendered as reader `long`/`float`/ +//! `double`, `long`→`float`/`double`, `float`→`double` (the `ResolveIntLong` +//! / `ResolveFloatDouble` / … machinery). +//! * default substitution. The reader record sometimes carries an *extra* +//! trailing field (absent from the writer) with a JSON `default`, driving +//! the "reader field not in writer, use default" branch in `resolve_named`. +//! * union matching. Multi-variant unions whose variants the resolver must +//! match up by type/name across writer and reader. +//! * enums with a `default` symbol. +//! We resolve writer-against-itself (identity), and both cross-directions. +//! 
+//! A panic is not the only failure mode, though. `resolve_schemas` can return +//! `Ok` while *deferring* a match failure into the resolved schema. It might +//! store an `Err` inside a `ResolveUnionUnion` permutation, say, which then +//! re-raises only when a record actually expresses that branch at decode time. +//! That is exactly the shape of #37087: +//! an `int`→`double` promotion inside a `["null", T]` union resolved to `Ok` +//! but failed to decode. A "doesn't panic" oracle that discards the `Result` +//! sees nothing wrong. So beyond requiring no panic, we add a *decode* oracle: +//! the reader rendering only ever widens the writer, so every node, every +//! union branch included, has a valid reader target, and decoding a +//! writer-encoded value through the writer→reader resolved schema MUST succeed. +//! A deferred mismatch turns that decode into an error, which this target +//! treats as a finding. + +#![no_main] + +use libfuzzer_sys::arbitrary::{self, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_avro::schema::resolve_schemas; +use mz_avro::{Schema, from_avro_datum}; + +/// One of the primitive Avro types, ordered by promotability so the reader +/// rendering can pick a "wider" target. `int` ⊑ `long` ⊑ `float` ⊑ `double`. +const PROMO_CHAIN: &[&str] = &["int", "long", "float", "double"]; +const OTHER_PRIMS: &[&str] = &["null", "boolean", "bytes", "string"]; + +/// A structured schema shape. Generated once, then rendered twice (writer / +/// reader) with controlled per-rendering variation. `names` are stable across +/// both renderings so named types line up during resolution. +enum Shape { + /// A primitive on the promotion chain (index into `PROMO_CHAIN`). + Promotable(usize), + /// A primitive that has no promotion (rendered identically on both sides). + OtherPrim(&'static str), + /// `[..]` union with N>=1 variants.
+ Union(Vec), + Array(Box), + Map(Box), + Record { + name: u32, + fields: Vec, + /// Whether the reader rendering appends an extra defaulted field. + reader_extra_default: bool, + }, + Enum { + name: u32, + /// Whether the reader rendering gives the enum a `default` symbol. + reader_default: bool, + }, + Fixed { + name: u32, + size: u8, + }, +} + +fn gen_shape(u: &mut Unstructured, counter: &mut u32, depth: u32) -> arbitrary::Result { + let choice = if depth == 0 || u.is_empty() { + u.int_in_range(0u8..=1)? + } else { + u.int_in_range(0u8..=8)? + }; + Ok(match choice { + 0 => Shape::Promotable(usize::from(u.int_in_range(0u8..=3)?)), + 1 => Shape::OtherPrim(u.choose(OTHER_PRIMS)?), + 2 => { + // Multi-variant union. We draw distinct primitive-ish variants so + // the union stays valid (Avro forbids duplicate non-named types). + let n = u.int_in_range(1u8..=3)?; + let mut variants = Vec::with_capacity(n.into()); + // First variant is often null (the common nullable shape). + if u.int_in_range(0u8..=1)? == 0 { + variants.push(Shape::OtherPrim("null")); + } + // Then a small set of distinct promotable primitives. + let mut used = [false; 4]; + for _ in 0..n { + let idx = usize::from(u.int_in_range(0u8..=3)?); + if !used[idx] { + used[idx] = true; + variants.push(Shape::Promotable(idx)); + } + } + if variants.is_empty() { + variants.push(Shape::Promotable(0)); + } + Shape::Union(variants) + } + 3 => Shape::Array(Box::new(gen_shape(u, counter, depth - 1)?)), + 4 => Shape::Map(Box::new(gen_shape(u, counter, depth - 1)?)), + 5 | 6 => { + *counter += 1; + let name = *counter; + let n = u.int_in_range(0u8..=3)?; + let mut fields = Vec::with_capacity(n.into()); + for _ in 0..n { + fields.push(gen_shape(u, counter, depth - 1)?); + } + Shape::Record { + name, + fields, + reader_extra_default: u.int_in_range(0u8..=1)? == 0, + } + } + 7 => { + *counter += 1; + Shape::Enum { + name: *counter, + reader_default: u.int_in_range(0u8..=1)? 
== 0, + } + } + _ => { + *counter += 1; + Shape::Fixed { + name: *counter, + size: u.int_in_range(1u8..=16)?, + } + } + }) +} + +/// Render the *writer* version of `shape` to schema JSON: promotable +/// primitives use their base type, records carry only their real fields, and +/// enums have no default. +fn render_writer(shape: &Shape, out: &mut String) { + match shape { + Shape::Promotable(idx) => { + out.push('"'); + out.push_str(PROMO_CHAIN[*idx]); + out.push('"'); + } + Shape::OtherPrim(p) => { + out.push('"'); + out.push_str(p); + out.push('"'); + } + Shape::Union(variants) => { + out.push('['); + for (i, v) in variants.iter().enumerate() { + if i > 0 { + out.push(','); + } + render_writer(v, out); + } + out.push(']'); + } + Shape::Array(item) => { + out.push_str("{\"type\":\"array\",\"items\":"); + render_writer(item, out); + out.push('}'); + } + Shape::Map(values) => { + out.push_str("{\"type\":\"map\",\"values\":"); + render_writer(values, out); + out.push('}'); + } + Shape::Record { name, fields, .. } => { + out.push_str(&format!("{{\"type\":\"record\",\"name\":\"N{name}\",\"fields\":[")); + for (i, f) in fields.iter().enumerate() { + if i > 0 { + out.push(','); + } + out.push_str(&format!("{{\"name\":\"f{i}\",\"type\":")); + render_writer(f, out); + out.push('}'); + } + out.push_str("]}"); + } + Shape::Enum { name, .. } => { + out.push_str(&format!( + "{{\"type\":\"enum\",\"name\":\"N{name}\",\"symbols\":[\"A\",\"B\",\"C\"]}}" + )); + } + Shape::Fixed { name, size } => { + out.push_str(&format!( + "{{\"type\":\"fixed\",\"name\":\"N{name}\",\"size\":{size}}}" + )); + } + } +} + +/// Render the *reader* version of `shape`: widens each promotable primitive to +/// a (fuzz-chosen) wider type on the promotion chain, appends a defaulted +/// `extra` record field, and gives enums a `default` symbol. These are all the +/// schema-evolution shapes `resolve_schemas` handles. 
+fn render_reader_promoted( + u: &mut Unstructured, + shape: &Shape, + out: &mut String, +) -> arbitrary::Result<()> { + match shape { + Shape::Promotable(idx) => { + // Choose a target at or after `idx` on the chain, a valid + // promotion the resolver should accept. + let target = u.int_in_range(*idx..=PROMO_CHAIN.len() - 1)?; + out.push('"'); + out.push_str(PROMO_CHAIN[target]); + out.push('"'); + } + Shape::OtherPrim(p) => { + out.push('"'); + out.push_str(p); + out.push('"'); + } + Shape::Union(variants) => { + out.push('['); + for (i, v) in variants.iter().enumerate() { + if i > 0 { + out.push(','); + } + render_reader_promoted(u, v, out)?; + } + out.push(']'); + } + Shape::Array(item) => { + out.push_str("{\"type\":\"array\",\"items\":"); + render_reader_promoted(u, item, out)?; + out.push('}'); + } + Shape::Map(values) => { + out.push_str("{\"type\":\"map\",\"values\":"); + render_reader_promoted(u, values, out)?; + out.push('}'); + } + Shape::Record { + name, + fields, + reader_extra_default, + } => { + out.push_str(&format!("{{\"type\":\"record\",\"name\":\"N{name}\",\"fields\":[")); + for (i, f) in fields.iter().enumerate() { + if i > 0 { + out.push(','); + } + out.push_str(&format!("{{\"name\":\"f{i}\",\"type\":")); + render_reader_promoted(u, f, out)?; + out.push('}'); + } + if *reader_extra_default { + if !fields.is_empty() { + out.push(','); + } + out.push_str("{\"name\":\"extra\",\"type\":\"long\",\"default\":7}"); + } + out.push_str("]}"); + } + Shape::Enum { name, reader_default } => { + out.push_str(&format!( + "{{\"type\":\"enum\",\"name\":\"N{name}\",\"symbols\":[\"A\",\"B\",\"C\"]" + )); + if *reader_default { + out.push_str(",\"default\":\"A\""); + } + out.push_str("}"); + } + Shape::Fixed { name, size } => { + out.push_str(&format!( + "{{\"type\":\"fixed\",\"name\":\"N{name}\",\"size\":{size}}}" + )); + } + } + Ok(()) +} + +/// Avro encodes `int`/`long` (and union branch indices, array/map block +/// counts, and blob lengths) as zig-zag 
varints. +fn encode_long(n: i64, out: &mut Vec) { + let mut z = ((n << 1) ^ (n >> 63)) as u64; + loop { + if z & !0x7f == 0 { + out.push(z as u8); + return; + } + out.push(((z & 0x7f) | 0x80) as u8); + z >>= 7; + } +} + +/// Length-prefixed bytes, the wire form of `string`, `bytes`, and each map key. +fn encode_blob(bytes: &[u8], out: &mut Vec) { + encode_long(bytes.len() as i64, out); + out.extend_from_slice(bytes); +} + +/// Avro-binary-encode one value matching the *writer* rendering of `shape` +/// (i.e. `render_writer`'s wire format), so it can be decoded back through a +/// resolved schema. For unions we deliberately pick a *promotable* branch: +/// that is the branch whose resolved decode walks the numeric-promotion path, +/// the exact spot where #37087 deferred a union match failure into the resolved +/// schema and re-raised it here at decode time. Every generated union has at +/// least one promotable variant, so the search never falls back. +fn encode_writer_value( + u: &mut Unstructured, + shape: &Shape, + out: &mut Vec, +) -> arbitrary::Result<()> { + match shape { + Shape::Promotable(idx) => match PROMO_CHAIN[*idx] { + "int" => encode_long(i64::from(u.arbitrary::()?), out), + "long" => encode_long(u.arbitrary::()?, out), + "float" => out.extend_from_slice(&u.arbitrary::()?.to_le_bytes()), + // "double" + _ => out.extend_from_slice(&u.arbitrary::()?.to_le_bytes()), + }, + Shape::OtherPrim(p) => match *p { + "null" => {} + "boolean" => out.push(u.int_in_range(0u8..=1)?), + // "bytes" | "string": both length-prefixed on the wire. 
+ _ => { + let n = u.int_in_range(0usize..=8)?; + let mut b = Vec::with_capacity(n); + for _ in 0..n { + b.push(u.int_in_range(0x20u8..=0x7e)?); + } + encode_blob(&b, out); + } + }, + Shape::Union(variants) => { + let branch = variants + .iter() + .position(|v| matches!(v, Shape::Promotable(_))) + .unwrap_or(0); + encode_long(branch as i64, out); + encode_writer_value(u, &variants[branch], out)?; + } + Shape::Array(item) => { + let n = u.int_in_range(0i64..=3)?; + if n > 0 { + encode_long(n, out); + for _ in 0..n { + encode_writer_value(u, item, out)?; + } + } + encode_long(0, out); // end-of-array block marker + } + Shape::Map(values) => { + let n = u.int_in_range(0i64..=3)?; + if n > 0 { + encode_long(n, out); + for _ in 0..n { + let k = u.int_in_range(0usize..=4)?; + let mut key = Vec::with_capacity(k); + for _ in 0..k { + key.push(u.int_in_range(0x61u8..=0x7a)?); + } + encode_blob(&key, out); + encode_writer_value(u, values, out)?; + } + } + encode_long(0, out); // end-of-map block marker + } + Shape::Record { fields, .. } => { + for f in fields { + encode_writer_value(u, f, out)?; + } + } + // The writer enum's symbols are always `["A","B","C"]`. + Shape::Enum { .. } => encode_long(u.int_in_range(0i64..=2)?, out), + Shape::Fixed { size, .. } => { + for _ in 0..*size { + out.push(u.arbitrary::()?); + } + } + } + Ok(()) +} + +fn run(mut u: Unstructured) -> arbitrary::Result<()> { + let mut counter = 0u32; + // The top level of an OCF/registry schema is virtually always a record. + counter += 1; + let top_name = counter; + let n = u.int_in_range(0u8..=4)?; + let mut fields = Vec::with_capacity(n.into()); + for _ in 0..n { + fields.push(gen_shape(&mut u, &mut counter, 3)?); + } + let shape = Shape::Record { + name: top_name, + fields, + reader_extra_default: u.int_in_range(0u8..=1)? 
== 0, + }; + + let mut writer_json = String::new(); + render_writer(&shape, &mut writer_json); + let mut reader_json = String::new(); + render_reader_promoted(&mut u, &shape, &mut reader_json)?; + + let Ok(writer) = writer_json.parse::() else { + return Ok(()); + }; + + // Encode one value against the writer rendering. We decode it back through + // the resolved schemas below. The writer only ever widens into the reader, + // so every node, every union branch included, has a valid reader target + // and decoding the writer's own bytes through the resolved schema must + // succeed. A `resolve_schemas` that returned `Ok` but deferred a fixable + // mismatch (see the module comment / #37087) surfaces it here as a decode + // error rather than slipping past a panic-only oracle. + let mut writer_value = Vec::new(); + encode_writer_value(&mut u, &shape, &mut writer_value)?; + + // Identity resolution must succeed and decode the writer's own bytes. + if let Ok(resolved) = resolve_schemas(&writer, &writer) { + from_avro_datum(&resolved, &mut &writer_value[..]) + .expect("decode through identity-resolved schema must succeed"); + } + if let Ok(reader) = reader_json.parse::() { + // Writer→reader is the *widening* direction: it hits the promotion / + // default / union-match branches, must resolve, and the resolved schema + // must decode the writer's bytes. A deferred promotion mismatch turns + // this `expect` into the fuzzer's signal. + if let Ok(resolved) = resolve_schemas(&writer, &reader) { + from_avro_datum(&resolved, &mut &writer_value[..]).expect( + "decode through writer→reader resolved schema must succeed; a failure means \ + resolution deferred a fixable mismatch into the resolved schema (see #37087)", + ); + } + // The reverse (reader→writer) *narrows*, so its resolution may + // legitimately fail or defer a genuine mismatch. We only require that + // neither direction panics. 
+ let _ = resolve_schemas(&reader, &writer); + let _ = resolve_schemas(&reader, &reader); + } + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let _ = run(Unstructured::new(data)); +}); diff --git a/src/avro/fuzz/prepare-corpus.sh b/src/avro/fuzz/prepare-corpus.sh new file mode 100755 index 0000000000000..cfa2c6df73569 --- /dev/null +++ b/src/avro/fuzz/prepare-corpus.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. +# +# prepare-corpus.sh: populate cargo-fuzz seed corpora with valid Avro +# container files so the fuzzer doesn't waste cycles bouncing off the +# magic-header check. libFuzzer mutates these into deeper structural +# variants while still hitting real decoder code paths. + +set -euo pipefail + +cd "$(dirname "$0")" + +mkdir -p corpus/reader_decode +find corpus/reader_decode -maxdepth 1 -name 'seed_*.avro' -delete +cp ../benches/quickstop-null.avro corpus/reader_decode/seed_01_quickstop_null.avro + +echo "Seeded:" +for d in corpus/*/; do + count=$(find "$d" -maxdepth 1 -name '*.avro' | wc -l) + printf " %-40s %4d seeds\n" "$d" "$count" +done diff --git a/src/buf.yaml b/src/buf.yaml index 43269aa875ada..b54e97c886127 100644 --- a/src/buf.yaml +++ b/src/buf.yaml @@ -28,3 +28,25 @@ build: - fivetran-destination/proto - testdrive/ci/protobuf-bin - testdrive/ci/protobuf-include + # Fuzz crates have their own `[workspace]` (required for nightly + + # libFuzzer) so each maintains its own `target/`. Build-script deps + # like `protobuf-src` extract `.proto` files into that tree. + # Don't scan it. 
This list is auto-generated from `src/*/fuzz` by + # generate-buf-config.py, so new fuzz crates are covered automatically. + - avro/fuzz/target + - catalog-protos/fuzz/target + - expr/fuzz/target + - interchange/fuzz/target + - mysql-util/fuzz/target + - persist-client/fuzz/target + - pgcopy/fuzz/target + - pgrepr/fuzz/target + - pgtz/fuzz/target + - pgwire/fuzz/target + - postgres-util/fuzz/target + - repr/fuzz/target + - sql-parser/fuzz/target + - sql-server-util/fuzz/target + - storage-types/fuzz/target + - storage/fuzz/target + - transform/fuzz/target diff --git a/src/buf.yaml.template b/src/buf.yaml.template index 35297a81c7b48..4dbd1688d6c98 100644 --- a/src/buf.yaml.template +++ b/src/buf.yaml.template @@ -25,3 +25,9 @@ build: - fivetran-destination/proto - testdrive/ci/protobuf-bin - testdrive/ci/protobuf-include + # Fuzz crates have their own `[workspace]` (required for nightly + + # libFuzzer) so each maintains its own `target/`. Build-script deps + # like `protobuf-src` extract `.proto` files into that tree. + # Don't scan it. This list is auto-generated from `src/*/fuzz` by + # generate-buf-config.py, so new fuzz crates are covered automatically. + ${fuzz-target-excludes} diff --git a/src/catalog-protos/fuzz/.gitignore b/src/catalog-protos/fuzz/.gitignore new file mode 100644 index 0000000000000..ab0eaa1a49031 --- /dev/null +++ b/src/catalog-protos/fuzz/.gitignore @@ -0,0 +1,5 @@ +target/ +corpus/ +artifacts/ +coverage/ +Cargo.lock diff --git a/src/catalog-protos/fuzz/Cargo.toml b/src/catalog-protos/fuzz/Cargo.toml new file mode 100644 index 0000000000000..20bf0ccbce0c0 --- /dev/null +++ b/src/catalog-protos/fuzz/Cargo.toml @@ -0,0 +1,31 @@ +# Fuzz crate for mz-catalog-protos serde round-trip properties. Catalog +# state is durable on-disk data, so a decoder bug here is a catalog-corruption +# risk. mz-catalog-protos uses serde (not prost) for the on-wire form, so +# the fuzz target is a serde_json round-trip rather than a proto one. 
+ +[package] +workspace = "../../../test/cargo-fuzz" +name = "mz-catalog-protos-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +mz-catalog-protos = { path = "..", features = ["proptest"] } +proptest = "1" +serde = "1.0" +serde_json = "1.0" + +[[bin]] +name = "catalog_objects_serde_roundtrip" +path = "fuzz_targets/catalog_objects_serde_roundtrip.rs" +test = false +doc = false +bench = false + +# The fuzz crate has its own `[workspace]` so it must duplicate the root's +# `[patch.crates-io]`. Keep in sync with the root `Cargo.toml`. diff --git a/src/catalog-protos/fuzz/fuzz_targets/catalog_objects_serde_roundtrip.rs b/src/catalog-protos/fuzz/fuzz_targets/catalog_objects_serde_roundtrip.rs new file mode 100644 index 0000000000000..d2a9171f43fc9 --- /dev/null +++ b/src/catalog-protos/fuzz/fuzz_targets/catalog_objects_serde_roundtrip.rs @@ -0,0 +1,134 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: catalog object serde JSON round-trip is idempotent. The +//! catalog state is durable on-disk data, so a serde edge case that loses +//! information through a JSON round trip is a catalog-corruption risk. +//! +//! Two complementary input arms (the first byte picks the arm, the next byte +//! picks which catalog type to exercise): +//! +//! * **Structured arm.** Drives the catalog type's proptest `Arbitrary` +//! (behind mz-catalog-protos' `proptest` feature) from the libFuzzer byte +//! stream to synthesize a *valid, deeply-populated* value, then asserts the +//! full `value -> JSON -> value -> JSON` chain is idempotent. We deliberately +//! 
target the genuinely nested catalog types: `ClusterValue` +//! (`RoleId` + `Vec` + the `ClusterConfig`/`ClusterVariant`/ +//! `ManagedCluster`/`ClusterSchedule` tree), `ItemValue` (the `CatalogItem` +//! enum + `GlobalId` enum + `Vec`), `RoleValue` (the +//! `RoleAttributes`/`RoleMembership`/`RoleVars`/`RoleVar` tree), +//! `NetworkPolicyValue` (`Vec`), and `ClusterReplicaValue` +//! (the `ReplicaConfig`/`ReplicaLocation` enum). Random JSON bytes almost +//! never reach these inner enum variants, so this is where the interesting +//! serde branches actually get covered. +//! * **Raw-bytes arm.** Deserializes arbitrary bytes straight into the type, +//! exercising the deserializer against malformed/adversarial JSON input, +//! then re-serializes the recovered value. + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use mz_catalog_protos::objects::{ + ClusterConfig, ClusterReplicaValue, ClusterValue, ConfigValue, GidMappingValue, ItemValue, + MzAclItem, NetworkPolicyValue, RoleId, RoleValue, SettingValue, +}; +use proptest::strategy::{Strategy, ValueTree}; +use proptest::test_runner::{Config, RngAlgorithm, TestRng, TestRunner}; + +/// Build a 32-byte proptest seed from `bytes` (zero-padded / truncated). +fn seed_from(bytes: &[u8]) -> [u8; 32] { + let mut seed = [0u8; 32]; + let n = bytes.len().min(32); + seed[..n].copy_from_slice(&bytes[..n]); + seed +} + +/// Synthesize a valid `T` via its proptest `Arbitrary`, then assert the serde +/// JSON round trip is idempotent. 
+fn structured_roundtrip(seed: &[u8]) +where + T: serde::de::DeserializeOwned + + serde::Serialize + + PartialEq + + std::fmt::Debug + + proptest::arbitrary::Arbitrary, +{ + let mut runner = TestRunner::new_with_rng( + Config::default(), + TestRng::from_seed(RngAlgorithm::ChaCha, &seed_from(seed)), + ); + let Ok(tree) = T::arbitrary().new_tree(&mut runner) else { + return; + }; + assert_idempotent(tree.current()); +} + +/// `value -> JSON -> value` must be the identity, and re-serializing must +/// produce byte-identical JSON. +fn assert_idempotent(orig: T) +where + T: serde::de::DeserializeOwned + serde::Serialize + PartialEq + std::fmt::Debug, +{ + let json = serde_json::to_vec(&orig).expect("serialize of valid value must succeed"); + let round: T = serde_json::from_slice(&json).expect("re-decode must round-trip"); + assert_eq!(orig, round, "serde roundtrip changed value"); + let json2 = serde_json::to_vec(&round).expect("re-serialize must succeed"); + assert_eq!(json, json2, "serde re-serialize was not idempotent"); +} + +/// Decode adversarial JSON bytes straight into `T`, then assert the recovered +/// value round-trips. +fn raw_roundtrip(data: &[u8]) +where + T: serde::de::DeserializeOwned + serde::Serialize + PartialEq + std::fmt::Debug, +{ + let Ok(orig) = serde_json::from_slice::(data) else { + return; + }; + assert_idempotent(orig); +} + +fuzz_target!(|data: &[u8]| { + let Some((&mode, rest)) = data.split_first() else { + return; + }; + let Some((&which, rest)) = rest.split_first() else { + return; + }; + + if mode & 1 == 0 { + // Structured arm: synthesize a valid, deeply-nested value. 
+ match which % 10 { + 0 => structured_roundtrip::(rest), + 1 => structured_roundtrip::(rest), + 2 => structured_roundtrip::(rest), + 3 => structured_roundtrip::(rest), + 4 => structured_roundtrip::(rest), + 5 => structured_roundtrip::(rest), + 6 => structured_roundtrip::(rest), + 7 => structured_roundtrip::(rest), + 8 => structured_roundtrip::(rest), + _ => structured_roundtrip::(rest), + } + } else { + // Raw-bytes arm: decode adversarial JSON, then round-trip. + match which % 10 { + 0 => raw_roundtrip::(rest), + 1 => raw_roundtrip::(rest), + 2 => raw_roundtrip::(rest), + 3 => raw_roundtrip::(rest), + 4 => raw_roundtrip::(rest), + 5 => raw_roundtrip::(rest), + 6 => raw_roundtrip::(rest), + 7 => raw_roundtrip::(rest), + 8 => raw_roundtrip::(rest), + _ => raw_roundtrip::(rest), + } + } +}); diff --git a/src/compute/src/render/context.rs b/src/compute/src/render/context.rs index fabd2ba318718..75a0aed50bead 100644 --- a/src/compute/src/render/context.rs +++ b/src/compute/src/render/context.rs @@ -938,7 +938,13 @@ impl<'scope, T: RenderTimestamp> CollectionBundle<'scope, T> { VecCollection<'scope, T, DataflowErrorSer, Diff>, ) { mfp.optimize(); - let mfp_plan = mfp.clone().into_plan().unwrap(); + // NOTE: only an un-extractable temporal predicate makes `into_plan` fail; + // `ExprPrepMaintained` rejects those at plan time and peeks materialize + // `mz_now()`, so none reach render. + let mfp_plan = mfp + .clone() + .into_plan() + .expect("temporal predicate extractable"); // If the MFP is trivial, we can just call `as_collection`. 
// In the case that we weren't going to apply the `key_val` optimization, @@ -959,7 +965,7 @@ impl<'scope, T: RenderTimestamp> CollectionBundle<'scope, T> { let max_demand = mfp.demand().last().map(|x| *x + 1).unwrap_or(0); mfp.permute_fn(|c| c, max_demand); mfp.optimize(); - let mfp_plan = mfp.into_plan().unwrap(); + let mfp_plan = mfp.into_plan().expect("temporal predicate extractable"); let mut datum_vec = DatumVec::new(); // Wrap in an `Rc` so that lifetimes work out. diff --git a/src/expr/fuzz/.gitignore b/src/expr/fuzz/.gitignore new file mode 100644 index 0000000000000..ab0eaa1a49031 --- /dev/null +++ b/src/expr/fuzz/.gitignore @@ -0,0 +1,5 @@ +target/ +corpus/ +artifacts/ +coverage/ +Cargo.lock diff --git a/src/expr/fuzz/Cargo.toml b/src/expr/fuzz/Cargo.toml new file mode 100644 index 0000000000000..1fdc0d92d51a1 --- /dev/null +++ b/src/expr/fuzz/Cargo.toml @@ -0,0 +1,104 @@ +# Fuzz crate for mz-expr proto round-trip properties. +# +# Excluded from the main workspace because libFuzzer requires nightly Rust. 
+# Run via the repo-wide runner: `bin/ci-builder run nightly ci/test/cargo-fuzz.sh`, +# or locally: +# cd src/expr/fuzz +# cargo +nightly fuzz run eval_error_proto_roundtrip -- -max_total_time=60 + +[package] +workspace = "../../../test/cargo-fuzz" +name = "mz-expr-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +mz-expr = { path = "..", features = ["proptest"] } +mz-repr = { path = "../../repr" } +mz-pgtz = { path = "../../pgtz" } +chrono = { version = "0.4", default-features = false } +mz-proto = { path = "../../proto" } +prost = "0.14.3" +proptest = "1" + +[[bin]] +name = "eval_error_proto_roundtrip" +path = "fuzz_targets/eval_error_proto_roundtrip.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "like_pattern_compile" +path = "fuzz_targets/like_pattern_compile.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "mir_scalar_reduce" +path = "fuzz_targets/mir_scalar_reduce.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "mfp_optimize" +path = "fuzz_targets/mfp_optimize.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "agg_decompose" +path = "fuzz_targets/agg_decompose.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "build_regex" +path = "fuzz_targets/build_regex.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "timezone_convert" +path = "fuzz_targets/timezone_convert.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "jsonb_get" +path = "fuzz_targets/jsonb_get.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "like_pattern_escape" +path = "fuzz_targets/like_pattern_escape.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "cast_string" +path = "fuzz_targets/cast_string.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "jsonb_path" +path = "fuzz_targets/jsonb_path.rs" +test = false +doc = false +bench = false diff --git 
a/src/expr/fuzz/fuzz_targets/agg_decompose.rs b/src/expr/fuzz/fuzz_targets/agg_decompose.rs new file mode 100644 index 0000000000000..95507c9dee885 --- /dev/null +++ b/src/expr/fuzz/fuzz_targets/agg_decompose.rs @@ -0,0 +1,324 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: aggregate functions must obey the decomposition laws that +//! Materialize's *incremental* reduce relies on. The dataflow engine never +//! re-aggregates a group from scratch. It maintains aggregates by combining +//! partial results, so each aggregate must satisfy: +//! +//! * **Permutation invariance** (every order-insensitive aggregate): the result +//! must not depend on the order the inputs arrive in. Accumulable maintenance +//! (sum, count) applies updates in arrival order, so any order-dependence is a +//! correctness bug. +//! * **Hierarchical re-aggregation** (min/max/any/all, idempotent aggregates +//! whose output type equals their input type): `agg(whole)` must equal +//! `agg([agg(chunk0), agg(chunk1), ...])` over the non-empty chunks of an +//! arbitrary partition. This is exactly how the bucketed hierarchical reduce +//! computes min/max, and it must hold even with nulls present in the data. +//! * **Additive decomposition**: `count(whole)` must equal the sum of the +//! per-chunk counts. Likewise `sum(whole)` must equal the sum of the +//! per-chunk sums (this is exactly how accumulable maintenance combines +//! partial sums across batches). We check the sum law for the *integer* sums +//! (`SumInt32`/`SumInt64`), where the per-chunk combination is exact. +//! +//! We generate a random multiset of nullable datums in one of several type +//! 
groups, a random permutation of it, and a random partition into chunks, then +//! check the applicable laws for each aggregate over the chosen group. +//! +//! Groups: `int4`/`int8`/`bool` (exact integers/booleans, plain equality +//! oracle), `text` (lexicographic min/max), plus `float8` and `numeric`. The +//! float/numeric groups exercise the `OrderedFloat`/`OrderedDecimal` ordering +//! used by min/max and feed in the tricky special values (`NaN`, `±Inf`, +//! `-0.0`). We only apply min/max (plus `count`) to the float/numeric groups: floating-point +//! and bounded-decimal *sum* is not +//! associative under rounding, so an additive/permutation law over it would be +//! a generator artifact rather than a real product invariant. Datum equality is +//! `OrderedFloat`-based (so `NaN == NaN`), so it is a sound oracle for the +//! ordering aggregates even with NaN present. + +#![no_main] + +use libfuzzer_sys::arbitrary::{self, Arbitrary, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_expr::AggregateFunc; +use mz_repr::{Datum, Diff, RowArena}; + +const MAX_ROWS: usize = 24; +const MAX_CHUNKS: usize = 4; + +#[derive(Clone, Copy)] +enum Group { + Int32, + Int64, + Bool, + Float64, + Numeric, + Str, +} + +/// A small fixed pool of `'static` strings for the text min/max group. Keeping +/// them `'static` lets text datums flow through the same `Datum<'static>` +/// shuffle/partition machinery as the scalar groups. The set is deliberately +/// tiny so duplicates and ties are common (which is what stresses min/max's +/// hierarchical tie-breaking), and includes the empty string and a prefix pair +/// (`"a"`/`"ab"`) where lexicographic ordering is subtle. +const POOL_STR: &[&str] = &["", "a", "ab", "abc", "b", "Z", "z", "10", "9"]; + +/// Which decomposition laws apply to an aggregate (permutation invariance always +/// applies and is checked separately). +enum Law { + /// Idempotent, output type == input type: `agg(whole) == agg(map(agg, parts))`.
+ Hierarchical, + /// `count(whole) == sum(map(count, parts))`. + AdditiveCount, + /// `sum(whole) == sum(map(sum, parts))`, with null partials (empty/all-null + /// chunks) skipped. Only sound for the exact-integer sums. + AdditiveSum, +} + +/// Float values worth probing: ordinary magnitudes plus the IEEE-754 corners +/// that the `OrderedFloat` total order has to canonicalize (NaN as the maximum, +/// the two infinities, and signed zeros, where `-0.0 == 0.0` but distinct bits). +const SPECIAL_F64: &[f64] = &[ + 0.0, + -0.0, + 1.0, + -1.0, + f64::INFINITY, + f64::NEG_INFINITY, + f64::NAN, + f64::MIN, + f64::MAX, +]; + +fn gen_datum(u: &mut Unstructured, group: Group) -> arbitrary::Result> { + if u.ratio(1u8, 5u8)? { + return Ok(Datum::Null); + } + Ok(match group { + Group::Int32 => Datum::Int32(i32::arbitrary(u)?), + Group::Int64 => Datum::Int64(i64::arbitrary(u)?), + Group::Bool => { + if bool::arbitrary(u)? { + Datum::True + } else { + Datum::False + } + } + Group::Float64 => { + // Bias toward the special values so min/max actually has to order + // NaN/Inf/-0.0 against ordinary numbers. Otherwise a fully random + // f64 almost never lands on a corner case. + let f = if u.ratio(1u8, 2u8)? { + let i = u.int_in_range(0..=SPECIAL_F64.len() - 1)?; + SPECIAL_F64[i] + } else { + f64::arbitrary(u)? + }; + Datum::from(f) + } + Group::Numeric => { + // Integer-valued numerics keep min/max exact and easy to read. The + // point of the group is the `OrderedDecimal` comparison path, not + // fractional precision. 
+ Datum::from(i128::from(i64::arbitrary(u)?)) + } + Group::Str => { + let i = u.int_in_range(0..=POOL_STR.len() - 1)?; + Datum::String(POOL_STR[i]) + } + }) +} + +fn aggregates(group: Group) -> Vec<(AggregateFunc, Law)> { + match group { + Group::Int32 => vec![ + (AggregateFunc::MaxInt32, Law::Hierarchical), + (AggregateFunc::MinInt32, Law::Hierarchical), + (AggregateFunc::SumInt32, Law::AdditiveSum), + (AggregateFunc::Count, Law::AdditiveCount), + ], + Group::Int64 => vec![ + (AggregateFunc::MaxInt64, Law::Hierarchical), + (AggregateFunc::MinInt64, Law::Hierarchical), + (AggregateFunc::SumInt64, Law::AdditiveSum), + (AggregateFunc::Count, Law::AdditiveCount), + ], + Group::Bool => vec![ + (AggregateFunc::MaxBool, Law::Hierarchical), + (AggregateFunc::MinBool, Law::Hierarchical), + (AggregateFunc::Any, Law::Hierarchical), + (AggregateFunc::All, Law::Hierarchical), + (AggregateFunc::Count, Law::AdditiveCount), + ], + // Float/numeric sum is not exactly associative under rounding, so we only + // assert the ordering laws (min/max are total-order selections and stay + // exact, NaN included) and the count law. + Group::Float64 => vec![ + (AggregateFunc::MaxFloat64, Law::Hierarchical), + (AggregateFunc::MinFloat64, Law::Hierarchical), + (AggregateFunc::Count, Law::AdditiveCount), + ], + Group::Numeric => vec![ + (AggregateFunc::MaxNumeric, Law::Hierarchical), + (AggregateFunc::MinNumeric, Law::Hierarchical), + (AggregateFunc::Count, Law::AdditiveCount), + ], + // Text min/max select by lexicographic byte order, output type == input + // type, so the hierarchical re-aggregation law applies. + Group::Str => vec![ + (AggregateFunc::MaxString, Law::Hierarchical), + (AggregateFunc::MinString, Law::Hierarchical), + (AggregateFunc::Count, Law::AdditiveCount), + ], + } +} + +/// A Fisher-Yates shuffle driven by the fuzz input. 
+fn shuffle( + u: &mut Unstructured, + input: &[Datum<'static>], +) -> arbitrary::Result>> { + let mut v = input.to_vec(); + for i in (1..v.len()).rev() { + let j = u.int_in_range(0..=i)?; + v.swap(i, j); + } + Ok(v) +} + +/// Randomly assign each input to one of `1..=MAX_CHUNKS` chunks (some may be +/// empty). The chunks' concatenation is a permutation of the input multiset. +fn partition( + u: &mut Unstructured, + input: &[Datum<'static>], +) -> arbitrary::Result>>> { + let k = u.int_in_range(1usize..=MAX_CHUNKS)?; + let mut chunks = vec![Vec::new(); k]; + for &d in input { + let b = u.int_in_range(0..=k - 1)?; + chunks[b].push(d); + } + Ok(chunks) +} + +fn as_count(d: Datum) -> i64 { + match d { + Datum::Int64(n) => n, + other => panic!("Count produced a non-int8 datum: {other:?}"), + } +} + +/// Decode an integer-sum result datum to an exact `i128`. `SumInt32` yields an +/// `Int64`, `SumInt64` yields an (integer-valued) `Numeric`. An empty/all-null +/// chunk yields `Null` (returned as `None`). The values are bounded (<= 24 +/// inputs of at most i64 magnitude), so every partial sum fits an `i128` +/// exactly and the per-chunk combination below is lossless. +fn as_sum(d: Datum) -> Option { + match d { + Datum::Null => None, + Datum::Int64(n) => Some(i128::from(n)), + Datum::Numeric(n) => { + Some(i128::try_from(n.0).expect("integer-valued sum must fit an i128")) + } + other => panic!("integer sum produced an unexpected datum: {other:?}"), + } +} + +fn run(u: &mut Unstructured) -> arbitrary::Result<()> { + let group = match u.int_in_range(0u8..=5)? 
{ + 0 => Group::Int32, + 1 => Group::Int64, + 2 => Group::Bool, + 3 => Group::Float64, + 4 => Group::Numeric, + _ => Group::Str, + }; + + let n = u.int_in_range(0usize..=MAX_ROWS)?; + let mut input = Vec::with_capacity(n); + for _ in 0..n { + input.push(gen_datum(u, group)?); + } + let permuted = shuffle(u, &input)?; + let chunks = partition(u, &input)?; + + let arena = RowArena::new(); + for (agg, law) in aggregates(group) { + // `AggregateFunc::eval` consumes `(datum, multiplicity)` pairs. Each + // generated row has multiplicity one, so pair every datum with + // `Diff::ONE`. + let whole = agg.eval(input.iter().map(|&d| (d, Diff::ONE)), &arena); + + // Permutation invariance: order must never matter. + let shuffled = agg.eval(permuted.iter().map(|&d| (d, Diff::ONE)), &arena); + assert_eq!( + whole, shuffled, + "{agg:?} is not permutation-invariant\n input = {input:?}\n permuted = {permuted:?}" + ); + + match law { + Law::Hierarchical => { + // agg(whole) == agg(map(agg, non-empty chunks)). Empty chunks are + // skipped: an empty chunk aggregates to null, and for any/all + // (three-valued) a stray null would corrupt an otherwise false/true + // result (`false OR null = null`). Min/max absorb null as their + // identity, but skipping is correct for all of them. 
+ let partials: Vec = chunks + .iter() + .filter(|c| !c.is_empty()) + .map(|c| agg.eval(c.iter().map(|&d| (d, Diff::ONE)), &arena)) + .collect(); + let reaggregated = agg.eval(partials.iter().map(|&d| (d, Diff::ONE)), &arena); + assert_eq!( + whole, reaggregated, + "{agg:?} fails hierarchical re-aggregation\n input = {input:?}\n chunks = {chunks:?}\n partials = {partials:?}" + ); + } + Law::AdditiveCount => { + // count(whole) == sum(map(count, chunks)) + let total: i64 = chunks + .iter() + .map(|c| as_count(agg.eval(c.iter().map(|&d| (d, Diff::ONE)), &arena))) + .sum(); + assert_eq!( + as_count(whole), + total, + "{agg:?} fails additive decomposition\n input = {input:?}\n chunks = {chunks:?}" + ); + } + Law::AdditiveSum => { + // sum(whole) == sum(map(sum, chunks)), combining the non-null + // per-chunk partials. A chunk that is empty or all-null sums to + // Null and contributes nothing. If *every* element is null the + // whole is also Null, so both sides are "no partials" and match. + let partials: Vec = chunks + .iter() + .filter_map(|c| as_sum(agg.eval(c.iter().map(|&d| (d, Diff::ONE)), &arena))) + .collect(); + let combined: Option = if partials.is_empty() { + None + } else { + Some(partials.iter().sum()) + }; + assert_eq!( + as_sum(whole), + combined, + "{agg:?} fails additive sum decomposition\n input = {input:?}\n chunks = {chunks:?}" + ); + } + } + } + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let mut u = Unstructured::new(data); + let _ = run(&mut u); +}); diff --git a/src/expr/fuzz/fuzz_targets/build_regex.rs b/src/expr/fuzz/fuzz_targets/build_regex.rs new file mode 100644 index 0000000000000..aa7f96bf281ea --- /dev/null +++ b/src/expr/fuzz/fuzz_targets/build_regex.rs @@ -0,0 +1,209 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `func::build_regex` compiles an untrusted regular expression +//! (and flags) for the `regexp_*` SQL functions, and the result matches +//! untrusted text. A user controls both, so a panic compiling or matching is a +//! real availability bug. (The `regex` crate is linear-time and size-limited, +//! so an oversized pattern returns an error rather than hanging or OOMing.) +//! +//! A random pattern often compiles, but random *text* rarely matches it, so the +//! match-found paths (capture extraction, `replace_all`, `split` on real splits) +//! stay under-exercised. So most inputs are generated: a structured pattern over +//! the regex feature set (classes, groups, alternation, quantifiers, anchors, +//! escapes) drawn from a small alphabet, plus text drawn from that *same* +//! alphabet so matches actually fire. A quarter of inputs are still raw +//! pattern/text bytes so the compiler's parse/reject paths keep their coverage. +//! +//! Beyond compile+match, we also exercise: +//! * `replace_all` with a replacement string carrying capture references +//! (`$1`, `${name}`, `$0`, a literal `$$`), so the regex crate's replacement +//! interpolation runs against the captures the generated text actually fills. +//! * occasional near-size-limit patterns built from nested counted quantifiers, +//! which push the compiler toward its state-count / size limit (where it must +//! return an error rather than hang or OOM) and toward deep AST nesting. + +#![no_main] + +use libfuzzer_sys::arbitrary::{self, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_expr::func; + +/// Shared alphabet for pattern literals and text, so generated text tends to +/// match the generated pattern. 
+const ALPHA: &[char] = &['a', 'b', 'c', '0', '1', ' ']; + +/// Return the longest prefix of `s` with at most `max` chars. +fn cap(s: &str, max: usize) -> &str { + match s.char_indices().nth(max) { + Some((i, _)) => &s[..i], + None => s, + } +} + +fn maybe_quantifier(u: &mut Unstructured, out: &mut String) -> arbitrary::Result<()> { + match u.int_in_range(0u8..=5)? { + 0 => out.push('*'), + 1 => out.push('+'), + 2 => out.push('?'), + 3 => out.push_str(&format!("{{{}}}", u.int_in_range(0u32..=5)?)), + 4 => out.push_str(&format!("{{{},}}", u.int_in_range(0u32..=3)?)), + _ => out.push_str(&format!( + "{{{},{}}}", + u.int_in_range(0u32..=2)?, + u.int_in_range(2u32..=5)? + )), + } + if u.int_in_range(0u8..=2)? == 0 { + out.push('?'); // lazy + } + Ok(()) +} + +fn gen_regex( + u: &mut Unstructured, + depth: u32, + name_id: &mut u32, + out: &mut String, +) -> arbitrary::Result<()> { + if depth == 0 || u.is_empty() || out.len() > 256 { + match u.int_in_range(0u8..=3)? { + 0 => out.push(*u.choose(ALPHA)?), + 1 => out.push('.'), + 2 => out.push_str(u.choose(&["\\d", "\\w", "\\s", "\\b", "\\.", "\\*", "\\p{L}"])?), + _ => { + out.push('['); + if u.int_in_range(0u8..=2)? == 0 { + out.push('^'); + } + for _ in 0..u.int_in_range(1usize..=3)? { + out.push(*u.choose(ALPHA)?); + } + if u.int_in_range(0u8..=1)? == 0 { + out.push_str("a-c"); + } + out.push(']'); + } + } + return Ok(()); + } + let d = depth - 1; + match u.int_in_range(0u8..=5)? { + 0 => gen_regex(u, d, name_id, out)?, + 1 => { + for _ in 0..u.int_in_range(2usize..=3)? { + gen_regex(u, d, name_id, out)?; + } + } + 2 => { + gen_regex(u, d, name_id, out)?; + out.push('|'); + gen_regex(u, d, name_id, out)?; + } + 3 => { + out.push('('); + // Mix non-capturing, plain-capturing, and named-capturing groups so + // both `$N` and `${name}` replacement refs resolve to real captures. + // Names must be unique, so derive one from a per-pattern counter. + match u.int_in_range(0u8..=2)? 
{ + 0 => out.push_str("?:"), + 1 => { + out.push_str(&format!("?P", *name_id)); + *name_id += 1; + } + _ => {} + } + gen_regex(u, d, name_id, out)?; + out.push(')'); + maybe_quantifier(u, out)?; + } + 4 => { + gen_regex(u, d, name_id, out)?; + maybe_quantifier(u, out)?; + } + _ => { + if u.int_in_range(0u8..=1)? == 0 { + out.push('^'); + } + gen_regex(u, d, name_id, out)?; + if u.int_in_range(0u8..=1)? == 0 { + out.push('$'); + } + } + } + Ok(()) +} + +/// Builds a deeply nested chain of counted quantifiers whose multiplied bounds +/// can approach the regex crate's compiled-size limit, e.g. `(?:(?:a){40}){40}`. +/// `build_regex` must compile it or return an error (PatternTooLarge or the +/// regex crate's CompiledTooBig) rather than hang or OOM. +fn gen_near_limit(u: &mut Unstructured, out: &mut String) -> arbitrary::Result<()> { + let layers = u.int_in_range(2usize..=5)?; + for _ in 0..layers { + out.push_str("(?:"); + } + out.push(*u.choose(ALPHA)?); + for _ in 0..layers { + out.push_str(&format!("){{{}}}", u.int_in_range(2u32..=60)?)); + } + Ok(()) +} + +/// A replacement string mixing several capture-reference forms so the regex +/// crate's interpolation runs against whatever captures the match produced: +/// numbered (`$1`), named-braced (`${g0}`), the whole match (`$0`), a literal +/// `$$`, and an unresolvable ref (`$99y`, read as name `99y`, expands to empty). +const REPLACEMENT: &str = "x$1-${g0}-$0-$$-$99y"; + +fn drive(pattern: &str, flags: &str, text: &str) { + let pattern = cap(pattern, 4096); + let text = cap(text, 4096); + let Ok(regex) = func::build_regex(pattern, flags) else { + return; + }; + let _ = regex.is_match(text); + let _ = regex.find(text); + let _ = regex.captures(text); + // Plain deletion plus capture-ref interpolation (exercises the replacement + // parser and capture-group lookup against real matches).
+ let _ = regex.replace_all(text, ""); + let _ = regex.replace_all(text, REPLACEMENT); + let _ = regex.split(text).count(); +} + +fn run(mut u: Unstructured) -> arbitrary::Result<()> { + let flags = *u.choose(&["", "i", "c", "ic", "ci"])?; + // A quarter of the time, raw pattern/text bytes (the compiler's parse paths). + if u.int_in_range(0u8..=3)? == 0 { + let pattern: String = u.arbitrary()?; + let text: String = u.arbitrary()?; + drive(&pattern, flags, &text); + return Ok(()); + } + let mut pattern = String::new(); + // Occasionally emit a near-size-limit nested-quantifier pattern (the + // compiler must reject it cleanly). Otherwise the structured generator. + if u.int_in_range(0u8..=7)? == 0 { + gen_near_limit(&mut u, &mut pattern)?; + } else { + let mut name_id = 0u32; + gen_regex(&mut u, 3, &mut name_id, &mut pattern)?; + } + let mut text = String::new(); + for _ in 0..u.int_in_range(0usize..=24)? { + text.push(*u.choose(ALPHA)?); + } + drive(&pattern, flags, &text); + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let _ = run(Unstructured::new(data)); +}); diff --git a/src/expr/fuzz/fuzz_targets/cast_string.rs b/src/expr/fuzz/fuzz_targets/cast_string.rs new file mode 100644 index 0000000000000..c91eb7b8810c8 --- /dev/null +++ b/src/expr/fuzz/fuzz_targets/cast_string.rs @@ -0,0 +1,163 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: the string->scalar cast functions parse untrusted text into a +//! typed value (`'...'::bigint`, `'...'::uuid`). These are hand-written parsers +//! reached directly from user SQL, so a panic on crafted input is an +//! availability bug. `cast_string_to_uuid` in particular wraps the `uuid` +//! 
crate, whose parser has panicked on malformed input before. We evaluate each +//! cast on the text and require it to return a value or an `EvalError`, not panic. +//! +//! Purely random text almost never resembles a UUID or a boundary integer, so a +//! raw `&str` plateaus on the early-reject paths and barely reaches the parsers' +//! interesting branches. Instead we mostly *construct* the candidate string: +//! * UUID-shaped strings: 32 hex digits with dashes placed at the canonical +//! 8/12/16/20 offsets (and off-by-one offsets), optional `{...}` / `urn:uuid:` +//! wrappers, off-by-one digit counts, and an occasional embedded NUL, to +//! drive the hyphen-position / length / wrapper handling in the uuid parser. +//! * Boundary `int64` strings: values around `i64::{MIN,MAX}`, off-by-one past +//! them (overflow path), leading `+`/`-`/zeros/whitespace, and embedded NUL. +//! A fraction of inputs remain fully arbitrary text so the early-reject paths +//! keep their coverage. + +#![no_main] + +use libfuzzer_sys::arbitrary::{self, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_expr::{func, Eval, MirScalarExpr, UnaryFunc}; +use mz_repr::{Datum, ReprScalarType, RowArena}; + +const HEX: &[u8] = b"0123456789abcdefABCDEF"; + +/// Builds a UUID-shaped candidate string: hex body, dashes at chosen offsets, +/// optional wrapper, occasionally perturbed so it lands just off a valid form. +fn gen_uuid_like(u: &mut Unstructured) -> arbitrary::Result { + // Number of hex digits: usually 32 (canonical), sometimes off-by-one. + let n_hex = match u.int_in_range(0u8..=4)? { + 0 => 31usize, + 1 => 33, + _ => 32, + }; + let mut hex = String::with_capacity(n_hex); + for _ in 0..n_hex { + hex.push(*u.choose(HEX)? as char); + } + + // Dash placement: canonical 8/12/16/20, off-by-one, none, or scattered. + let body = match u.int_in_range(0u8..=4)?
{ + 0 => hex.clone(), // no dashes (uuid::simple form) + 1 => insert_dashes(&hex, &[8, 12, 16, 20]), + 2 => insert_dashes(&hex, &[8, 12, 16, 20]), // canonical, weighted + 3 => insert_dashes(&hex, &[7, 12, 16, 20]), // off-by-one first group + _ => { + // Scatter a few dashes at fuzzer-chosen positions. + let mut positions = Vec::new(); + for _ in 0..u.int_in_range(0usize..=5)? { + positions.push(u.int_in_range(0usize..=hex.len())?); + } + positions.sort_unstable(); + insert_dashes(&hex, &positions) + } + }; + + // Optional wrapper. + let mut s = match u.int_in_range(0u8..=3)? { + 0 => format!("{{{body}}}"), + 1 => format!("urn:uuid:{body}"), + 2 => format!("{{urn:uuid:{body}}}"), + _ => body, + }; + + // Occasionally embed a NUL or stray byte to probe parser robustness. + if u.int_in_range(0u8..=7)? == 0 { + let pos = u.int_in_range(0usize..=s.len())?; + // Find a char boundary at or before `pos`. + let pos = (0..=pos) + .rev() + .find(|&p| s.is_char_boundary(p)) + .unwrap_or(0); + s.insert(pos, '\0'); + } + Ok(s) +} + +/// Inserts `-` into `hex` at the given (ascending) char offsets. +fn insert_dashes(hex: &str, offsets: &[usize]) -> String { + let mut out = String::with_capacity(hex.len() + offsets.len()); + for (i, c) in hex.chars().enumerate() { + if offsets.contains(&i) { + out.push('-'); + } + out.push(c); + } + out +} + +/// Builds a boundary-ish int64 string: values near the i64 limits, just past +/// them, with leading sign/zeros/whitespace, or an embedded NUL. +fn gen_int_like(u: &mut Unstructured) -> arbitrary::Result { + let base: &str = u.choose(&[ + "0", + "-0", + "+0", + "9223372036854775807", // i64::MAX + "9223372036854775808", // i64::MAX + 1 (overflow) + "-9223372036854775808", // i64::MIN + "-9223372036854775809", // i64::MIN - 1 (overflow) + "99999999999999999999", // way past + "-1", + "+1", + ])?; + let mut s = String::new(); + // Optional leading whitespace / extra zeros / sign. + match u.int_in_range(0u8..=4)? 
{ + 0 => s.push_str(" "), + 1 => s.push('\t'), + 2 => s.push('+'), + 3 => s.push_str("000"), + _ => {} + } + s.push_str(base); + // Optional trailing junk / NUL. + match u.int_in_range(0u8..=5)? { + 0 => s.push_str(" "), + 1 => s.push('\0'), + 2 => s.push('x'), + _ => {} + } + Ok(s) +} + +fn drive(s: &str) { + let arena = RowArena::new(); + let input = || MirScalarExpr::literal_ok(Datum::String(s), ReprScalarType::String); + + let _ = input() + .call_unary(UnaryFunc::CastStringToInt64(func::CastStringToInt64)) + .eval(&[], &arena); + let _ = input() + .call_unary(UnaryFunc::CastStringToUuid(func::CastStringToUuid)) + .eval(&[], &arena); +} + +fn run(mut u: Unstructured) -> arbitrary::Result<()> { + let s = match u.int_in_range(0u8..=4)? { + // Mostly UUID-shaped (the uuid crate is the higher-risk parser). + 0 | 1 | 2 => gen_uuid_like(&mut u)?, + 3 => gen_int_like(&mut u)?, + // A fraction fully arbitrary, preserving the early-reject coverage. + _ => u.arbitrary()?, + }; + drive(&s); + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let _ = run(Unstructured::new(data)); +}); diff --git a/src/expr/fuzz/fuzz_targets/eval_error_proto_roundtrip.rs b/src/expr/fuzz/fuzz_targets/eval_error_proto_roundtrip.rs new file mode 100644 index 0000000000000..e3ad1d4634199 --- /dev/null +++ b/src/expr/fuzz/fuzz_targets/eval_error_proto_roundtrip.rs @@ -0,0 +1,90 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `EvalError` must survive a proto re-encode + re-decode with the +//! same value, exercising lossy proto conversions where the wire form decodes +//! into a value that doesn't round-trip back to itself. +//! +//! 
Two input modes share the byte stream (the first byte selects the mode): +//! +//! * Mode A (Arbitrary): drive proptest's `Arbitrary` impl for `EvalError` +//! from the libFuzzer byte stream to synthesize a *valid, deeply nested* +//! `EvalError` (one of 80+ variants, several carrying `char`, `usize`, +//! nested `NumericMaxScale`/`InvalidArrayError`/`DomainLimit` invariants). +//! We then assert `from_proto(into_proto(v)) == v`. Random bytes decoded as +//! a proto almost always yield near-empty/default messages, so this arm is +//! what actually reaches the interesting variants and their narrowing / +//! validation logic (`char::from_proto`, `usize`/`u32` casts, etc.). +//! +//! * Mode B (raw bytes): decode the remaining bytes directly as +//! `ProtoEvalError` and, if it converts to Rust, assert it round-trips. +//! Kept for robustness against hand-crafted / malformed wire forms that the +//! structured generator would never produce. + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use mz_expr::{EvalError, ProtoEvalError}; +use mz_proto::ProtoType; +use proptest::strategy::{Strategy, ValueTree}; +use proptest::test_runner::{Config, RngAlgorithm, TestRng, TestRunner}; +use prost::Message; + +/// Assert that an `EvalError` survives encode -> decode -> into_rust unchanged. +fn assert_roundtrip(orig: &EvalError) { + let proto = >::from_rust(orig); + let bytes = proto.encode_to_vec(); + let decoded = ProtoEvalError::decode(bytes.as_slice()) + .expect("re-encode of valid EvalError must decode"); + let round: EvalError = decoded + .into_rust() + .expect("re-encoded EvalError must convert back to Rust"); + assert_eq!(orig, &round, "EvalError changed across proto roundtrip"); +} + +/// Decode `data` directly as a `ProtoEvalError`. If it converts to a Rust +/// `EvalError`, assert that value round-trips.
+fn raw_bytes_arm(data: &[u8]) { + let Ok(proto) = ProtoEvalError::decode(data) else { + return; + }; + let orig: EvalError = match proto.into_rust() { + Ok(v) => v, + Err(_) => return, + }; + assert_roundtrip(&orig); +} + +fuzz_target!(|data: &[u8]| { + let Some((&mode, rest)) = data.split_first() else { + return; + }; + + if mode & 1 == 0 { + // Mode A: structured Arbitrary generation. Derive a 32-byte seed from + // the remaining bytes (cycled / zero-padded) so even short inputs are + // usable, then drive proptest's Arbitrary impl from it. + let mut seed = [0u8; 32]; + if !rest.is_empty() { + for (i, slot) in seed.iter_mut().enumerate() { + *slot = rest[i % rest.len()]; + } + } + let rng = TestRng::from_seed(RngAlgorithm::ChaCha, &seed); + let mut runner = TestRunner::new_with_rng(Config::default(), rng); + let value = ::arbitrary() + .new_tree(&mut runner) + .expect("valuetree") + .current(); + assert_roundtrip(&value); + } else { + // Mode B: raw wire bytes. + raw_bytes_arm(rest); + } +}); diff --git a/src/expr/fuzz/fuzz_targets/jsonb_get.rs b/src/expr/fuzz/fuzz_targets/jsonb_get.rs new file mode 100644 index 0000000000000..cebff0629b2cb --- /dev/null +++ b/src/expr/fuzz/fuzz_targets/jsonb_get.rs @@ -0,0 +1,145 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: the jsonb access operators `->` / `->>` (field/element). A user +//! controls both the jsonb value (parsed from untrusted text) and the key/index, +//! and the operators traverse arbitrary nested structure, so a panic accessing +//! into a crafted document is an availability bug. We parse untrusted JSON, then +//! 
apply all four single-step accessors and require none to panic: +//! * `jsonb -> ''` and `jsonb -> ` (return jsonb) +//! * `jsonb ->> ''` and `jsonb ->> ` (the `Stringify` variants, which +//! additionally run `jsonb_stringify` on the accessed element, a distinct +//! text-rendering path over the same crafted sub-document). +//! +//! Random text almost never parses as JSON, so a raw input would just bail at +//! `from_str` and the access operators would barely run. We instead generate a +//! valid JSON document (objects with keys from a small set, arrays, scalars) and +//! generate the access key/index from that same set, so the accessors hit real +//! fields/elements (the success + traversal paths) as well as missing ones, and +//! the index includes out-of-range and extreme values for the array-bounds path. + +#![no_main] + +use std::str::FromStr; + +use libfuzzer_sys::arbitrary::{self, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_expr::{func, Eval, MirScalarExpr}; +use mz_repr::adt::jsonb::Jsonb; +use mz_repr::{Datum, ReprScalarType, RowArena}; + +/// Object keys, kept to a small set so generated access keys hit real fields. +const KEYS: &[&str] = &["a", "b", "c", "x"]; + +fn gen_json(u: &mut Unstructured, depth: u32, out: &mut String) -> arbitrary::Result<()> { + let leaf = depth == 0 || u.is_empty(); + match if leaf { + u.int_in_range(0u8..=3)? + } else { + u.int_in_range(0u8..=5)? + } { + 0 => out.push_str("null"), + 1 => out.push_str(if u.int_in_range(0u8..=1)? == 0 { + "true" + } else { + "false" + }), + 2 => out.push_str(&i64::from(u.arbitrary::()?).to_string()), + 3 => { + out.push('"'); + for _ in 0..u.int_in_range(0usize..=4)? 
{ + out.push(*u.choose(&['a', 'z', '0', ' '])?); + } + out.push('"'); + } + 4 => { + out.push('['); + let n = u.int_in_range(0usize..=4)?; + for i in 0..n { + if i > 0 { + out.push(','); + } + gen_json(u, depth - 1, out)?; + } + out.push(']'); + } + _ => { + out.push('{'); + let n = u.int_in_range(0usize..=4)?; + for i in 0..n { + if i > 0 { + out.push(','); + } + out.push('"'); + out.push_str(u.choose(KEYS)?); + out.push_str("\":"); + gen_json(u, depth - 1, out)?; + } + out.push('}'); + } + } + Ok(()) +} + +fn run(mut u: Unstructured) -> arbitrary::Result<()> { + let mut json = String::new(); + gen_json(&mut u, 5, &mut json)?; + let Ok(jsonb) = Jsonb::from_str(&json) else { + return Ok(()); + }; + let value = jsonb.as_ref().into_datum(); + let arena = RowArena::new(); + + // Key: usually a real field name (hit), sometimes a miss / arbitrary string. + let key_buf; + let key: &str = if u.int_in_range(0u8..=2)? == 0 { + let n = u.int_in_range(0usize..=4)?; + let mut s = String::new(); + for _ in 0..n { + s.push(u.int_in_range(0x20u8..=0x7e)? as char); + } + key_buf = s; + &key_buf + } else { + u.choose(KEYS)? + }; + // Index: usually small (in/just-past array bounds), sometimes extreme. + let index: i64 = if u.int_in_range(0u8..=3)? == 0 { + u.arbitrary::()? + } else { + i64::from(u.int_in_range(-3i32..=8)?) 
+ }; + + let key_expr = || MirScalarExpr::literal_ok(Datum::String(key), ReprScalarType::String); + let index_expr = || MirScalarExpr::literal_ok(Datum::Int64(index), ReprScalarType::Int64); + let jsonb_expr = || MirScalarExpr::literal_ok(value, ReprScalarType::Jsonb); + + // jsonb -> '' (object field access, returns jsonb) + let _ = jsonb_expr() + .call_binary(key_expr(), func::JsonbGetString) + .eval(&[], &arena); + // jsonb ->> '' (object field access, returns text via jsonb_stringify) + let _ = jsonb_expr() + .call_binary(key_expr(), func::JsonbGetStringStringify) + .eval(&[], &arena); + + // jsonb -> (array element access, returns jsonb) + let _ = jsonb_expr() + .call_binary(index_expr(), func::JsonbGetInt64) + .eval(&[], &arena); + // jsonb ->> (array element access, returns text via jsonb_stringify) + let _ = jsonb_expr() + .call_binary(index_expr(), func::JsonbGetInt64Stringify) + .eval(&[], &arena); + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let _ = run(Unstructured::new(data)); +}); diff --git a/src/expr/fuzz/fuzz_targets/jsonb_path.rs b/src/expr/fuzz/fuzz_targets/jsonb_path.rs new file mode 100644 index 0000000000000..9750217628710 --- /dev/null +++ b/src/expr/fuzz/fuzz_targets/jsonb_path.rs @@ -0,0 +1,176 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: the jsonb path-access operators `#>` / `#>>`. A user controls +//! both the jsonb value (parsed from untrusted text) and a `text[]` path, and +//! the operators walk the path into arbitrary nested structure, so a panic on a +//! crafted document/path is an availability bug. We parse untrusted JSON, build +//! 
a `text[]` path from the fuzzed components, then apply `jsonb #> path` +//! (returns jsonb) and `jsonb #>> path` (returns text), requiring neither to +//! panic. Complements `jsonb_get`, which covers the single-step `->` / `->>` +//! operators. +//! +//! Random text almost never parses as JSON, so a raw input would bail at +//! `from_str` and the path walk would barely run. We instead generate a valid +//! JSON document (objects with keys from a small set, arrays, scalars) and a +//! path whose components are drawn from that same set, so the walk descends +//! through real nested structure (the multi-step traversal the operators exist +//! for), with some missing components and numeric-looking array indices mixed in. +//! +//! Array-index components are not just `"0".."3"`: the list step runs +//! `strconv::parse_int64` on the component and, for a negative index, computes +//! `list.len().wrapping_sub(index)` to count from the end. So we also emit +//! negative (`"-1"`), out-of-range/huge (`"99999999999"`), boundary +//! (`"-9223372036854775808"` = `i64::MIN`, whose `unsigned_abs` is one past +//! `i64::MAX`), and malformed-but-numeric-looking (`"+0"`, `" 1"`, `"1.0"`, +//! `"0x10"`) strings, covering the parse-failure short-circuit and the +//! wrapping-subtraction index arithmetic. + +#![no_main] + +use std::str::FromStr; + +use libfuzzer_sys::arbitrary::{self, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_expr::{func, Eval, MirScalarExpr}; +use mz_repr::adt::array::{ArrayDimension, InvalidArrayError}; +use mz_repr::adt::jsonb::Jsonb; +use mz_repr::{Datum, ReprScalarType, RowArena}; + +/// Object keys / path components, kept small so the path hits real fields. 
+const KEYS: &[&str] = &["a", "b", "c", "x"];
+
+/// Numeric-looking array-index components exercising the list step's
+/// `parse_int64` + negative-index `wrapping_sub` arithmetic: in-range,
+/// negative (count-from-end), huge/out-of-range, the `i64::MIN` boundary, and
+/// malformed-but-numeric-looking strings that `parse_int64` should reject.
+const INDICES: &[&str] = &[
+    "0",
+    "1",
+    "2",
+    "3",
+    "-1",
+    "-2",
+    "99999999999",
+    "-99999999999",
+    "9223372036854775807", // i64::MAX
+    "-9223372036854775808", // i64::MIN (unsigned_abs is i64::MAX + 1)
+    "+0",
+    " 1",
+    "1.0",
+    "0x10",
+    "",
+];
+
+fn gen_json(u: &mut Unstructured, depth: u32, out: &mut String) -> arbitrary::Result<()> {
+    let leaf = depth == 0 || u.is_empty();
+    match if leaf {
+        u.int_in_range(0u8..=3)?
+    } else {
+        u.int_in_range(0u8..=5)?
+    } {
+        0 => out.push_str("null"),
+        1 => out.push_str(if u.int_in_range(0u8..=1)? == 0 {
+            "true"
+        } else {
+            "false"
+        }),
+        2 => out.push_str(&i64::from(u.arbitrary::<i32>()?).to_string()), // NOTE(review): integer type was lost in extraction; `i32` assumed, confirm
+        3 => {
+            out.push('"');
+            for _ in 0..u.int_in_range(0usize..=4)? {
+                out.push(*u.choose(&['a', 'z', '0', ' '])?);
+            }
+            out.push('"');
+        }
+        4 => {
+            out.push('[');
+            let n = u.int_in_range(0usize..=4)?;
+            for i in 0..n {
+                if i > 0 {
+                    out.push(',');
+                }
+                gen_json(u, depth - 1, out)?;
+            }
+            out.push(']');
+        }
+        _ => {
+            out.push('{');
+            let n = u.int_in_range(0usize..=4)?;
+            for i in 0..n {
+                if i > 0 {
+                    out.push(',');
+                }
+                out.push('"');
+                out.push_str(u.choose(KEYS)?);
+                out.push_str("\":");
+                gen_json(u, depth - 1, out)?;
+            }
+            out.push('}');
+        }
+    }
+    Ok(())
+}
+
+fn run(mut u: Unstructured) -> arbitrary::Result<()> {
+    let mut json = String::new();
+    gen_json(&mut u, 5, &mut json)?;
+    let Ok(jsonb) = Jsonb::from_str(&json) else {
+        return Ok(());
+    };
+    let value = jsonb.as_ref().into_datum();
+    let arena = RowArena::new();
+
+    // Path components: object keys (hits), array-index strings from `INDICES`, and
+    // an occasional miss, so the walk descends real structure and also dead-ends.
+    let n = u.int_in_range(0usize..=5)?;
+    let mut path: Vec<&str> = Vec::with_capacity(n);
+    for _ in 0..n {
+        path.push(match u.int_in_range(0u8..=4)? {
+            0 | 1 => u.choose(KEYS)?,
+            2 | 3 => *u.choose(INDICES)?,
+            _ => "missing",
+        });
+    }
+
+    let dims = if path.is_empty() {
+        Vec::new()
+    } else {
+        vec![ArrayDimension {
+            lower_bound: 1,
+            length: path.len(),
+        }]
+    };
+    let path_datum = match arena.try_make_datum::<_, InvalidArrayError>(|packer| {
+        packer.try_push_array(&dims, path.iter().map(|s| Datum::String(s)))
+    }) {
+        Ok(d) => d,
+        Err(_) => return Ok(()),
+    };
+    let path_ty = ReprScalarType::Array(Box::new(ReprScalarType::String));
+
+    // jsonb #> '{a,b,...}' (returns jsonb)
+    let get_path = MirScalarExpr::literal_ok(value, ReprScalarType::Jsonb).call_binary(
+        MirScalarExpr::literal_ok(path_datum, path_ty.clone()),
+        func::JsonbGetPath,
+    );
+    let _ = get_path.eval(&[], &arena);
+
+    // jsonb #>> '{a,b,...}' (returns text)
+    let get_path_stringify = MirScalarExpr::literal_ok(value, ReprScalarType::Jsonb).call_binary(
+        MirScalarExpr::literal_ok(path_datum, path_ty),
+        func::JsonbGetPathStringify,
+    );
+    let _ = get_path_stringify.eval(&[], &arena);
+    Ok(())
+}
+
+fuzz_target!(|data: &[u8]| {
+    let _ = run(Unstructured::new(data));
+});
diff --git a/src/expr/fuzz/fuzz_targets/like_pattern_compile.rs b/src/expr/fuzz/fuzz_targets/like_pattern_compile.rs
new file mode 100644
index 0000000000000..119c00fe5e15f
--- /dev/null
+++ b/src/expr/fuzz/fuzz_targets/like_pattern_compile.rs
@@ -0,0 +1,110 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `like_pattern::compile` turns an untrusted SQL `LIKE`/`ILIKE`
+//! pattern into a matcher (a string automaton or a compiled regex). A user
+//! controls the pattern, so a panic or pathological build here is a real
+//! availability bug.
We check two things:
+//!
+//! 1. Compiling an arbitrary pattern and matching arbitrary text never panics.
+//! 2. A pattern built by escaping every wildcard/escape character in `text` is
+//!    a pure literal, and `LIKE` is anchored, so it must match exactly `text`.
+//!
+//! A pattern/text pair of random Unicode rarely contains the wildcard/escape
+//! metacharacters or the literal overlap that drives the interesting code, so it
+//! plateaus on shallow shapes. Instead we mostly draw both the pattern and the
+//! match text from a tiny shared alphabet of literals plus the LIKE
+//! metacharacters `%`, `_`, and `\`. Generating *multiple* `%` is deliberate: at
+//! two-or-more `%` (`many_subpatterns > 1`) `compile` abandons the backtracking
+//! string matcher and routes to the linear-time regex engine, a boundary the
+//! string-only inputs never reach. Drawing the text from the same alphabet means
+//! it actually matches the wildcards, exercising the `%`-suffix-search /
+//! backtracking and the regex path on real (not always-empty) matches.
+
+#![no_main]
+
+use libfuzzer_sys::arbitrary::{self, Unstructured};
+use libfuzzer_sys::fuzz_target;
+use mz_expr::like_pattern;
+
+/// Literals shared between pattern and text, so wildcards match real characters.
+const LITERALS: &[char] = &['a', 'b', 'c'];
+
+/// Builds a LIKE pattern over the literal alphabet plus the `%`/`_`/`\`
+/// metacharacters, weighted toward emitting several `%` so the >1-`%` regex
+/// routing boundary is crossed.
+fn gen_pattern(u: &mut Unstructured) -> arbitrary::Result<String> {
+    let mut p = String::new();
+    let n = u.int_in_range(0usize..=16)?;
+    for _ in 0..n {
+        match u.int_in_range(0u8..=6)? {
+            // Weight `%` heavily (3/7) so multiple-`%` patterns are common.
+            0 | 1 | 2 => p.push('%'),
+            3 => p.push('_'),
+            4 => {
+                // Escape sequence: `\` followed by a metachar or literal. A bare
+                // trailing `\` (the unterminated-escape arm) is intentionally
+                // reachable when this is the last iteration.
+                p.push('\\');
+                if u.int_in_range(0u8..=3)? != 0 {
+                    p.push(*u.choose(&['%', '_', '\\', 'a'])?);
+                }
+            }
+            _ => p.push(*u.choose(LITERALS)?),
+        }
+    }
+    Ok(p)
+}
+
+/// Builds match text over the same literal alphabet.
+fn gen_text(u: &mut Unstructured) -> arbitrary::Result<String> {
+    let mut t = String::new();
+    for _ in 0..u.int_in_range(0usize..=24)? {
+        t.push(*u.choose(LITERALS)?);
+    }
+    Ok(t)
+}
+
+fn run(mut u: Unstructured) -> arbitrary::Result<()> {
+    let case_insensitive = u.arbitrary()?;
+
+    // Pattern/text: usually from the shared alphabet, sometimes fully arbitrary
+    // strings so the reject/parse paths over arbitrary Unicode keep their coverage.
+    let (pattern, text) = if u.int_in_range(0u8..=3)? == 0 {
+        (u.arbitrary::<String>()?, u.arbitrary::<String>()?)
+    } else {
+        (gen_pattern(&mut u)?, gen_text(&mut u)?)
+    };
+
+    // (1) Arbitrary pattern + arbitrary text: compile and match must not panic.
+    if let Ok(matcher) = like_pattern::compile(&pattern, case_insensitive) {
+        let _ = matcher.is_match(&text);
+    }
+
+    // (2) Escape every LIKE metacharacter in `text` to get a literal pattern,
+    // which must match exactly its own source text.
+    let mut literal = String::with_capacity(text.len() + 8);
+    for c in text.chars() {
+        if matches!(c, '%' | '_' | '\\') {
+            literal.push('\\');
+        }
+        literal.push(c);
+    }
+    if let Ok(matcher) = like_pattern::compile(&literal, case_insensitive) {
+        assert!(
+            matcher.is_match(&text),
+            "literal LIKE pattern {literal:?} must match its source text {text:?}"
+        );
+    }
+    Ok(())
+}
+
+fuzz_target!(|data: &[u8]| {
+    let _ = run(Unstructured::new(data));
+});
diff --git a/src/expr/fuzz/fuzz_targets/like_pattern_escape.rs b/src/expr/fuzz/fuzz_targets/like_pattern_escape.rs
new file mode 100644
index 0000000000000..81ffb17964753
--- /dev/null
+++ b/src/expr/fuzz/fuzz_targets/like_pattern_escape.rs
@@ -0,0 +1,77 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `like_pattern::normalize_pattern` rewrites a `LIKE ... ESCAPE c` +//! pattern (custom escape character) into the default-escape form before +//! compilation. The user controls both the pattern and the escape character, so +//! the custom-escape rewrite, a char-by-char parser with escape state, must +//! not panic on any input, and its output must still compile and match. This is +//! the `ESCAPE`-clause path that `like_pattern_compile` (default escape only) +//! never exercises. +//! +//! With a fully arbitrary escape char and a fully arbitrary pattern, the escape +//! char almost never coincides with a character actually in the pattern, so the +//! two branches that matter stay cold: the custom-escape *consume* branch (where +//! the escape char is followed by another char) and the trailing *unterminated* +//! escape arm (where the escape char is the final character). To light both up +//! we draw the pattern over a tiny alphabet of LIKE metacharacters plus a couple +//! literals, and draw the escape char from that *same* alphabet, so it lands on +//! the pattern's own characters and the escape state machine runs in earnest, +//! including the off-by-one trailing-escape case. + +#![no_main] + +use libfuzzer_sys::arbitrary::{self, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_expr::like_pattern::{self, EscapeBehavior}; + +/// Shared alphabet for the pattern and the escape char, so the escape char +/// frequently matches characters in the pattern and the consume/unterminated +/// branches fire. Includes the LIKE metacharacters and a few literals. 
+const ALPHA: &[char] = &['%', '_', '\\', 'a', 'b', 'c'];
+
+fn gen_pattern(u: &mut Unstructured) -> arbitrary::Result<String> {
+    let mut p = String::new();
+    for _ in 0..u.int_in_range(0usize..=20)? {
+        p.push(*u.choose(ALPHA)?);
+    }
+    Ok(p)
+}
+
+fn run(mut u: Unstructured) -> arbitrary::Result<()> {
+    let case_insensitive = u.arbitrary()?;
+
+    let (pattern, escape_char) = if u.int_in_range(0u8..=3)? == 0 {
+        // Some fully-arbitrary inputs keep the raw-Unicode reject coverage.
+        (u.arbitrary::<String>()?, u.arbitrary::<char>()?)
+    } else {
+        (gen_pattern(&mut u)?, *u.choose(ALPHA)?)
+    };
+    // Text from the same literal-ish alphabet so the compiled matcher can match.
+    let mut text = String::new();
+    for _ in 0..u.int_in_range(0usize..=20)? {
+        text.push(*u.choose(ALPHA)?);
+    }
+
+    for behavior in [EscapeBehavior::Char(escape_char), EscapeBehavior::Disabled] {
+        let Ok(normalized) = like_pattern::normalize_pattern(&pattern, behavior) else {
+            continue;
+        };
+        // The rewritten pattern is in default-escape form and must compile and
+        // match arbitrary text without panicking.
+        if let Ok(matcher) = like_pattern::compile(&normalized, case_insensitive) {
+            let _ = matcher.is_match(&text);
+        }
+    }
+    Ok(())
+}
+
+fuzz_target!(|data: &[u8]| {
+    let _ = run(Unstructured::new(data));
+});
diff --git a/src/expr/fuzz/fuzz_targets/mfp_optimize.rs b/src/expr/fuzz/fuzz_targets/mfp_optimize.rs
new file mode 100644
index 0000000000000..a939d9febb6ce
--- /dev/null
+++ b/src/expr/fuzz/fuzz_targets/mfp_optimize.rs
@@ -0,0 +1,536 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `MapFilterProject::optimize` must preserve evaluation.
+//!
+//!
A `MapFilterProject` (MFP) is the linear map/filter/project pipeline that
+//! runs in essentially every dataflow operator. `optimize` fuses and reorders
+//! maps and predicates, drops unused map expressions, and canonicalizes the
+//! projection. A miscompile here silently corrupts query results, so it is a
+//! high-value correctness target.
+//!
+//! We build a random, well-typed MFP over an `int4`/`int8`/`bool` input schema
+//! (map expressions can reference earlier columns, and predicates and the
+//! projection reference any column), with a scalar vocabulary that includes
+//! `int4`/`int8` arithmetic and the `int4`↔`int8` casts (`int8`→`int4` is fallible,
+//! so it feeds optimize's error-handling). The target runs in one of two modes:
+//!
+//! * **Non-temporal preservation.** Evaluate the original and the `optimize`d
+//!   clone on a batch of random input rows. The oracle is one-directional,
+//!   mirroring the contract optimize actually owes: optimize is allowed to
+//!   *drop* an error or a row that the original would reject, because it removes
+//!   unused map expressions and reorders predicates. But for every input row the
+//!   *original* passes through cleanly with output `out`, the optimized plan
+//!   must also pass it through with the byte-identical `out`. (When the original
+//!   errors or filters a row we assert nothing.)
+//!
+//! * **Temporal lowering.** Add predicates of the form `mz_now() <op> e` (and
+//!   conjunctions of them) over a bounded `mz_timestamp` expression `e`, then
+//!   lower the MFP to a temporal `MfpPlan` (`into_plan`, which runs `optimize`
+//!   and `extract_temporal_bounds`, the operator/`StepMzTimestamp` translation
+//!   that the non-temporal path bails on). The plan defines a per-row validity
+//!   interval `[lower, upper)`. We read that interval off a single `evaluate`
+//!   and check, for a batch of concrete logical times `T`, that "the row is live
+//!   at `T`" (`lower <= T < upper`, with the non-temporal predicates also
+//!
passing) agrees with a substitution reference: the same MFP with every
+//!   `mz_now()` replaced by the literal `mz_timestamp` `T`, evaluated
+//!   non-temporally. The compared timestamps are kept well below `u64::MAX` so
+//!   `StepMzTimestamp` (the `+1` the lowering inserts for `<=`/`>`/`=`) never
+//!   overflows, which keeps the substitution equivalence exact. As above the
+//!   oracle is one-directional: if the reference errors at `T` we assert
+//!   nothing.
+
+#![no_main]
+
+use libfuzzer_sys::arbitrary::{self, Arbitrary, Unstructured};
+use libfuzzer_sys::fuzz_target;
+use mz_expr::{func, EvalError, MapFilterProject, MirScalarExpr, UnmaterializableFunc};
+use mz_repr::{Datum, Diff, ReprScalarType, Row, RowArena, Timestamp};
+
+// Input schema: int4 columns, then int8 columns, then bool columns.
+const N_IN_INT: usize = 2;
+const N_IN_LONG: usize = 1;
+const N_IN_BOOL: usize = 2;
+const N_INPUT: usize = N_IN_INT + N_IN_LONG + N_IN_BOOL;
+const MAX_MAPS: usize = 4;
+const MAX_FILTERS: usize = 3;
+const SCALAR_DEPTH: u32 = 4;
+const ROWS_PER_MFP: usize = 8;
+// Concrete logical times probed against the temporal plan's validity interval.
+const TIMES_PER_MFP: usize = 6;
+
+#[derive(Clone, Copy, PartialEq)]
+enum Ty {
+    Int,
+    Long,
+    Bool,
+}
+
+fn scalar_ty(ty: Ty) -> ReprScalarType {
+    match ty {
+        Ty::Int => ReprScalarType::Int32,
+        Ty::Long => ReprScalarType::Int64,
+        Ty::Bool => ReprScalarType::Bool,
+    }
+}
+
+fn input_types() -> Vec<Ty> {
+    let mut v = vec![Ty::Int; N_IN_INT];
+    v.extend(std::iter::repeat(Ty::Long).take(N_IN_LONG));
+    v.extend(std::iter::repeat(Ty::Bool).take(N_IN_BOOL));
+    v
+}
+
+fn rand_ty(u: &mut Unstructured) -> arbitrary::Result<Ty> {
+    Ok(match u.int_in_range(0u8..=2)? {
+        0 => Ty::Int,
+        1 => Ty::Long,
+        _ => Ty::Bool,
+    })
+}
+
+fn datum_of(u: &mut Unstructured, ty: Ty, nullable: bool) -> arbitrary::Result<Datum<'static>> {
+    if nullable && u.ratio(1u8, 4u8)? {
+        return Ok(Datum::Null);
+    }
+    Ok(match ty {
+        Ty::Int => Datum::Int32(i32::arbitrary(u)?),
+        Ty::Long => Datum::Int64(i64::arbitrary(u)?),
+        Ty::Bool => {
+            if bool::arbitrary(u)? {
+                Datum::True
+            } else {
+                Datum::False
+            }
+        }
+    })
+}
+
+/// Pick the index of an available column whose type is `want`, if any exists.
+fn pick_col(u: &mut Unstructured, cols: &[Ty], want: Ty) -> arbitrary::Result<Option<usize>> {
+    let matching: Vec<usize> = cols
+        .iter()
+        .enumerate()
+        .filter(|(_, t)| **t == want)
+        .map(|(i, _)| i)
+        .collect();
+    if matching.is_empty() {
+        return Ok(None);
+    }
+    let i = u.int_in_range(0..=matching.len() - 1)?;
+    Ok(Some(matching[i]))
+}
+
+fn gen_leaf(u: &mut Unstructured, want: Ty, cols: &[Ty]) -> arbitrary::Result<MirScalarExpr> {
+    let st = scalar_ty(want);
+    // Prefer a column reference when one of the right type is available. This is
+    // what makes optimize's unused-map / projection reasoning interesting.
+    if u.ratio(1u8, 2u8)? {
+        if let Some(col) = pick_col(u, cols, want)? {
+            return Ok(MirScalarExpr::column(col));
+        }
+    }
+    Ok(match u.int_in_range(0u8..=1)? {
+        0 => MirScalarExpr::literal_ok(datum_of(u, want, false)?, st),
+        _ => MirScalarExpr::literal_null(st),
+    })
+}
+
+fn gen_scalar(
+    u: &mut Unstructured,
+    want: Ty,
+    cols: &[Ty],
+    depth: u32,
+) -> arbitrary::Result<MirScalarExpr> {
+    if depth == 0 || u.ratio(2u8, 5u8)? {
+        return gen_leaf(u, want, cols);
+    }
+    let d = depth - 1;
+    match want {
+        Ty::Int => match u.int_in_range(0u8..=5)? {
+            0 => {
+                let cond = gen_scalar(u, Ty::Bool, cols, d)?;
+                let then = gen_scalar(u, Ty::Int, cols, d)?;
+                let els = gen_scalar(u, Ty::Int, cols, d)?;
+                Ok(cond.if_then_else(then, els))
+            }
+            1 => Ok(gen_scalar(u, Ty::Int, cols, d)?
+                .call_binary(gen_scalar(u, Ty::Int, cols, d)?, func::AddInt32)),
+            2 => Ok(gen_scalar(u, Ty::Int, cols, d)?
+                .call_binary(gen_scalar(u, Ty::Int, cols, d)?, func::SubInt32)),
+            3 => Ok(gen_scalar(u, Ty::Int, cols, d)?
+                .call_binary(gen_scalar(u, Ty::Int, cols, d)?, func::MulInt32)),
+            4 => Ok(gen_scalar(u, Ty::Int, cols, d)?
+                .call_binary(gen_scalar(u, Ty::Int, cols, d)?, func::ModInt32)),
+            // int8 -> int4 (fallible: out-of-range overflows).
+            _ => Ok(gen_scalar(u, Ty::Long, cols, d)?.call_unary(func::CastInt64ToInt32)),
+        },
+        Ty::Long => match u.int_in_range(0u8..=4)? {
+            0 => {
+                let cond = gen_scalar(u, Ty::Bool, cols, d)?;
+                let then = gen_scalar(u, Ty::Long, cols, d)?;
+                let els = gen_scalar(u, Ty::Long, cols, d)?;
+                Ok(cond.if_then_else(then, els))
+            }
+            1 => Ok(gen_scalar(u, Ty::Long, cols, d)?
+                .call_binary(gen_scalar(u, Ty::Long, cols, d)?, func::AddInt64)),
+            2 => Ok(gen_scalar(u, Ty::Long, cols, d)?
+                .call_binary(gen_scalar(u, Ty::Long, cols, d)?, func::SubInt64)),
+            3 => Ok(gen_scalar(u, Ty::Long, cols, d)?
+                .call_binary(gen_scalar(u, Ty::Long, cols, d)?, func::MulInt64)),
+            // int4 -> int8 (infallible widening).
+            _ => Ok(gen_scalar(u, Ty::Int, cols, d)?.call_unary(func::CastInt32ToInt64)),
+        },
+        Ty::Bool => match u.int_in_range(0u8..=5)? {
+            0 => {
+                let cond = gen_scalar(u, Ty::Bool, cols, d)?;
+                let then = gen_scalar(u, Ty::Bool, cols, d)?;
+                let els = gen_scalar(u, Ty::Bool, cols, d)?;
+                Ok(cond.if_then_else(then, els))
+            }
+            1 => Ok(gen_scalar(u, Ty::Bool, cols, d)?.and(gen_scalar(u, Ty::Bool, cols, d)?)),
+            2 => Ok(gen_scalar(u, Ty::Bool, cols, d)?.or(gen_scalar(u, Ty::Bool, cols, d)?)),
+            3 => Ok(gen_scalar(u, Ty::Bool, cols, d)?.not()),
+            4 => {
+                let t = rand_ty(u)?;
+                let a = gen_scalar(u, t, cols, d)?;
+                let b = gen_scalar(u, t, cols, d)?;
+                Ok(match u.int_in_range(0u8..=4)? {
+                    0 => a.call_binary(b, func::Eq),
+                    1 => a.call_binary(b, func::Lt),
+                    2 => a.call_binary(b, func::Gt),
+                    3 => a.call_binary(b, func::Lte),
+                    _ => a.call_binary(b, func::Gte),
+                })
+            }
+            _ => {
+                let t = rand_ty(u)?;
+                Ok(gen_scalar(u, t, cols, d)?.call_is_null())
+            }
+        },
+    }
+}
+
+fn gen_input_row(u: &mut Unstructured, types: &[Ty]) -> arbitrary::Result<Row> {
+    let mut row = Row::default();
+    let mut packer = row.packer();
+    for ty in types {
+        packer.push(datum_of(u, *ty, true)?);
+    }
+    drop(packer);
+    Ok(row)
+}
+
+/// `mz_now()`.
+fn mz_now() -> MirScalarExpr {
+    MirScalarExpr::CallUnmaterializable(UnmaterializableFunc::MzNow)
+}
+
+/// An `mz_timestamp` literal.
+fn ts_literal(t: u64) -> MirScalarExpr {
+    MirScalarExpr::literal_ok(
+        Datum::MzTimestamp(Timestamp::new(t)),
+        ReprScalarType::MzTimestamp,
+    )
+}
+
+/// A bounded `mz_timestamp`-typed expression to compare `mz_now()` against.
+/// Sometimes a bare literal, sometimes derived from an `int8` column via
+/// `CastInt64ToMzTimestamp` so the bound depends on the row. Either way the
+/// value is kept well below `u64::MAX` so the `StepMzTimestamp` (`+1`) that the
+/// lowering inserts for `<=`/`>`/`=` cannot overflow, keeping the substitution
+/// oracle exact.
+fn gen_ts_expr(u: &mut Unstructured, cols: &[Ty]) -> arbitrary::Result<MirScalarExpr> {
+    if u.ratio(1u8, 2u8)? {
+        if let Some(col) = pick_col(u, cols, Ty::Long)? {
+            // The column holds an arbitrary i64. Clamp it into [0, BOUND) with a
+            // modulo so the cast can't go out of range or near the step overflow.
+            let masked = MirScalarExpr::column(col)
+                .call_binary(ts_bound_i64(), func::ModInt64)
+                .call_unary(func::AbsInt64)
+                .call_unary(func::CastInt64ToMzTimestamp);
+            return Ok(masked);
+        }
+    }
+    let t = u.int_in_range(0u64..=TS_BOUND)?;
+    Ok(ts_literal(t))
+}
+
+/// Upper bound on generated timestamps (exclusive for column-derived bounds,
+/// inclusive for literals), far from `u64::MAX` so `StepMzTimestamp` never overflows.
+const TS_BOUND: u64 = 1_000_000;
+
+fn ts_bound_i64() -> MirScalarExpr {
+    MirScalarExpr::literal_ok(
+        Datum::Int64(i64::try_from(TS_BOUND).unwrap()),
+        ReprScalarType::Int64,
+    )
+}
+
+/// A single temporal predicate `mz_now() <op> e`.
+fn gen_temporal_pred(u: &mut Unstructured, cols: &[Ty]) -> arbitrary::Result<MirScalarExpr> {
+    let e = gen_ts_expr(u, cols)?;
+    Ok(match u.int_in_range(0u8..=4)? {
+        0 => mz_now().call_binary(e, func::Eq),
+        1 => mz_now().call_binary(e, func::Lt),
+        2 => mz_now().call_binary(e, func::Lte),
+        3 => mz_now().call_binary(e, func::Gt),
+        _ => mz_now().call_binary(e, func::Gte),
+    })
+}
+
+/// Build the reference plan for a temporal MFP at a concrete logical time `t`:
+/// take the original MFP, replace every `mz_now()` with the literal
+/// `mz_timestamp` `t`, and lower it. After substitution there are no temporal
+/// expressions, so the plan is non-temporal by construction. Returns `None` if
+/// lowering rejects it (treated as "assert nothing at this `t`"). The result
+/// depends only on `t`, not on any row, so callers build it once per `t`.
+fn reference_plan(mfp: &MapFilterProject, t: u64) -> Option<mz_expr::SafeMfpPlan> {
+    let mut substituted = mfp.clone();
+    let subst = |e: &mut MirScalarExpr| {
+        e.visit_pre_mut(|node| {
+            if let MirScalarExpr::CallUnmaterializable(UnmaterializableFunc::MzNow) = node {
+                *node = ts_literal(t);
+            }
+        });
+    };
+    for expr in &mut substituted.expressions {
+        subst(expr);
+    }
+    for (_, pred) in &mut substituted.predicates {
+        subst(pred);
+    }
+    substituted.into_plan().ok()?.into_nontemporal().ok()
+}
+
+/// Evaluate a reference plan on a row: `Some(pass)` for a clean pass/fail, `None`
+/// on an evaluation error.
+fn reference_present(plan: &mz_expr::SafeMfpPlan, row: &Row) -> Option<bool> {
+    let arena = RowArena::new();
+    let mut datums: Vec<Datum> = row.iter().collect();
+    let mut buf = Row::default();
+    match plan.evaluate_into(&mut datums, &arena, &mut buf) {
+        Ok(Some(_)) => Some(true),
+        Ok(None) => Some(false),
+        Err(_) => None,
+    }
+}
+
+type Interval = Option<(Timestamp, Option<Timestamp>)>;
+/// Read the validity interval a temporal plan assigns to `row`, evaluating once
+/// with `time = 0`, `diff = +1`, and an always-valid frontier. Returns:
+/// * `Err(())` if evaluation errored,
+/// * `Ok(None)` if the (non-temporal part of the) plan rejected the row,
+/// * `Ok(Some((lower, upper)))` for the half-open interval `[lower, upper)`
+///   (`upper == None` means unbounded above).
+fn temporal_interval(plan: &mz_expr::MfpPlan, row: &Row) -> Result<Interval, ()> {
+    let arena = RowArena::new();
+    let mut datums: Vec<Datum> = row.iter().collect();
+    let mut row_builder = Row::default();
+    let mut lower: Option<Timestamp> = None;
+    let mut upper: Option<Timestamp> = None;
+    let mut errored = false;
+    let mut produced = false;
+    for result in plan.evaluate::<EvalError, _>( // NOTE(review): turbofish args were lost in extraction; inferred, confirm
+        &mut datums,
+        &arena,
+        Timestamp::new(0),
+        Diff::ONE,
+        |_t| true,
+        &mut row_builder,
+    ) {
+        produced = true;
+        match result {
+            Ok((_, t, diff)) => {
+                // A `+1` diff marks the start of validity; any other diff marks its end.
+                if diff == Diff::ONE {
+                    lower = Some(t);
+                } else {
+                    upper = Some(t);
+                }
+            }
+            Err(_) => errored = true,
+        }
+    }
+    if errored {
+        return Err(());
+    }
+    if !produced {
+        return Ok(None);
+    }
+    Ok(Some((lower.unwrap_or_else(|| Timestamp::new(0)), upper)))
+}
+
+/// Non-temporal preservation: the original optimize-vs-clone output equivalence.
+fn run_nontemporal(
+    u: &mut Unstructured,
+    mfp: MapFilterProject,
+    types: &[Ty],
+) -> arbitrary::Result<()> {
+    let mut optimized = mfp.clone();
+    optimized.optimize(); // NOTE(review): `into_plan` below is said to run `optimize` too, so `plan_orig` may not be an unoptimized baseline; confirm
+
+    let Ok(plan_orig) = mfp.into_plan() else {
+        return Ok(());
+    };
+    let Ok(safe_orig) = plan_orig.into_nontemporal() else {
+        return Ok(());
+    };
+    let Ok(plan_opt) = optimized.into_plan() else {
+        return Ok(());
+    };
+    let Ok(safe_opt) = plan_opt.into_nontemporal() else {
+        return Ok(());
+    };
+
+    for _ in 0..ROWS_PER_MFP {
+        let row = gen_input_row(u, types)?;
+        let arena = RowArena::new();
+
+        let mut datums_o: Vec<Datum> = row.iter().collect();
+        let mut buf_o = Row::default();
+        // Reduce the result to pass/fail/error, ending the borrow of `buf_o`.
+        let orig = match safe_orig.evaluate_into(&mut datums_o, &arena, &mut buf_o) {
+            Ok(Some(_)) => Some(true),
+            Ok(None) => Some(false),
+            Err(_) => None,
+        };
+
+        // Only a row the original passes through cleanly constrains the optimized
+        // plan. An error or a filtered row lets optimize legitimately differ.
+        if orig != Some(true) {
+            continue;
+        }
+
+        let mut datums_p: Vec<Datum> = row.iter().collect();
+        let mut buf_p = Row::default();
+        match safe_opt.evaluate_into(&mut datums_p, &arena, &mut buf_p) {
+            Ok(Some(out)) => assert_eq!(
+                &buf_o, out,
+                "optimize changed the projected output\n row = {row:?}\n out_orig = {buf_o:?}\n out_opt = {out:?}"
+            ),
+            Ok(None) => panic!("optimize filtered out a row the original passed\n row = {row:?}"),
+            Err(e) => panic!(
+                "optimize errored on a row the original passed cleanly\n row = {row:?}\n err = {e:?}"
+            ),
+        }
+    }
+    Ok(())
+}
+
+/// Temporal lowering: validity interval from the lowered plan vs. the
+/// `mz_now()`-substitution reference.
+fn run_temporal(
+    u: &mut Unstructured,
+    mfp: MapFilterProject,
+    types: &[Ty],
+) -> arbitrary::Result<()> {
+    // `into_plan` runs `optimize` and `extract_temporal_bounds`. An Err means an
+    // unsupported temporal shape, which is a legitimate rejection, not a bug.
+    let Ok(plan) = mfp.clone().into_plan() else {
+        return Ok(());
+    };
+
+    // Sample the logical times to probe, and build each substitution reference
+    // plan once (it depends only on `t`, not on the row). A reference plan that
+    // fails to lower is dropped: we simply assert nothing at that time.
+    let mut times: Vec<(u64, mz_expr::SafeMfpPlan)> = Vec::with_capacity(TIMES_PER_MFP);
+    for _ in 0..TIMES_PER_MFP {
+        let t = u.int_in_range(0u64..=TS_BOUND + 2)?;
+        if let Some(ref_plan) = reference_plan(&mfp, t) {
+            times.push((t, ref_plan));
+        }
+    }
+
+    for _ in 0..ROWS_PER_MFP {
+        let row = gen_input_row(u, types)?;
+        let interval = temporal_interval(&plan, &row);
+        // A plan that errors on this row lets the lowered plan legitimately
+        // differ from the substitution reference, so assert nothing.
+        let interval = match interval {
+            Err(()) => continue,
+            Ok(iv) => iv,
+        };
+
+        for (t, ref_plan) in &times {
+            // One-directional: only constrain the plan when the reference is a
+            // clean pass/fail. A reference error lets the lowered plan differ.
+            let Some(present_ref) = reference_present(ref_plan, &row) else {
+                continue;
+            };
+            let present_plan = match &interval {
+                // Plan rejects the row at every time (non-temporal predicate
+                // failed): it is absent at every `t`.
+                None => false,
+                Some((lower, upper)) => {
+                    let tt = Timestamp::new(*t);
+                    *lower <= tt && upper.map(|up| tt < up).unwrap_or(true)
+                }
+            };
+            assert_eq!(
+                present_plan, present_ref,
+                "temporal lowering disagrees with substitution at t={t}\n interval = {interval:?}\n row = {row:?}\n mfp = {mfp:?}"
+            );
+        }
+    }
+    Ok(())
+}
+
+fn run(u: &mut Unstructured) -> arbitrary::Result<()> {
+    let types = input_types();
+    // Columns available to later expressions: input columns plus appended maps.
+    let mut cols = types.clone();
+
+    let temporal = u.ratio(1u8, 2u8)?;
+
+    let n_maps = u.int_in_range(0..=MAX_MAPS)?;
+    let mut maps = Vec::with_capacity(n_maps);
+    for _ in 0..n_maps {
+        let t = rand_ty(u)?;
+        maps.push(gen_scalar(u, t, &cols, SCALAR_DEPTH)?);
+        cols.push(t);
+    }
+
+    let n_filters = u.int_in_range(0..=MAX_FILTERS)?;
+    let mut filters = Vec::with_capacity(n_filters);
+    for _ in 0..n_filters {
+        // In temporal mode, mix in `mz_now()` predicates alongside ordinary ones.
+        if temporal && u.ratio(3u8, 5u8)? {
+            filters.push(gen_temporal_pred(u, &cols)?);
+        } else {
+            filters.push(gen_scalar(u, Ty::Bool, &cols, SCALAR_DEPTH)?);
+        }
+    }
+    // Guarantee at least one temporal predicate in temporal mode so the lowering
+    // path is actually exercised.
+    if temporal && !filters.iter().any(|f| f.contains_temporal()) {
+        filters.push(gen_temporal_pred(u, &cols)?);
+    }
+
+    let n_proj = u.int_in_range(0..=cols.len())?;
+    let mut projection = Vec::with_capacity(n_proj);
+    for _ in 0..n_proj {
+        projection.push(u.int_in_range(0..=cols.len() - 1)?);
+    }
+
+    let mfp = MapFilterProject::new(N_INPUT)
+        .map(maps)
+        .filter(filters)
+        .project(projection);
+
+    if temporal {
+        run_temporal(u, mfp, &types)
+    } else {
+        run_nontemporal(u, mfp, &types)
+    }
+}
+
+fuzz_target!(|data: &[u8]| {
+    let mut u = Unstructured::new(data);
+    let _ = run(&mut u);
+});
diff --git a/src/expr/fuzz/fuzz_targets/mir_scalar_reduce.rs b/src/expr/fuzz/fuzz_targets/mir_scalar_reduce.rs
new file mode 100644
index 0000000000000..2165e70d0a98e
--- /dev/null
+++ b/src/expr/fuzz/fuzz_targets/mir_scalar_reduce.rs
@@ -0,0 +1,382 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//!
Fuzz target: `MirScalarExpr::reduce` must preserve successful evaluations. +//! The optimizer folds constants, simplifies `If`, and propagates nulls via +//! `reduce`, and downstream passes trust the reduced expression to evaluate to +//! the same value as the original. +//! +//! We generate a random, well-typed expression over a fixed `int4`/`int8`/ +//! `bool`/`text` schema, tracking the type of every subexpression during +//! generation so the result is well-typed by construction and `eval` can't +//! panic on a type mismatch. The function vocabulary spans the folder's +//! trickiest areas: integer arithmetic (`+`/`-`/`*`/`%`, negate, abs) on both +//! `int4` and `int8`, the boolean connectives `AND`/`OR`/`NOT`, comparisons and +//! `IS NULL` over every type, text concatenation/length, the `If` rewrites, and +//! a cast matrix (`int4`↔`int8`, `int`/`bool`→`text`, `text`→`int`, `int`→ +//! `bool`), many of which are *fallible* (overflow, parse failure, division by +//! zero), exercising reduce's error propagation. +//! +//! Crucially, alongside the binary `a AND b` / `a OR b` shapes, we also emit +//! *n-ary* `CallVariadic(And/Or, ..)` nodes with 3+ operands that nest other +//! n-ary And/Or directly inside (an `And` of `Or`s of `And`s, …). This is the +//! input shape that reaches reduce's variadic AND/OR machinery: +//! `flatten_associative` (collapsing same-func nests), `reduce_and_canonicalize` +//! (sort/dedup/short-circuit), `demorgans`, and especially `undistribute_and_or` +//! (factoring `(a&&b)||(a&&c)` into `a&&(b||c)`), which only fires on wide, +//! repeated-operand AND/OR trees that a purely binary generator essentially +//! never produces. +//! +//! We reduce a clone with the accurate column types and check evaluation on a +//! batch of random rows. The check is one-directional: reduce is allowed to +//! *eliminate* a runtime error (e.g. `If(c, x, x)` becomes `x`, dropping `c`, +//! 
and `x AND false` becomes `false`), so we only require that a successful +//! `Ok(v)` result is preserved exactly. Reduce must never alter a value or turn +//! a success into an error. Every value type compared is exact (no float/numeric +//! normalization), so equality is the right oracle. + +#![no_main] + +use libfuzzer_sys::arbitrary::{self, Arbitrary, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_expr::func::variadic::{And, Or}; +use mz_expr::{func, Eval, EvalError, MirScalarExpr}; +use mz_repr::{Datum, ReprColumnType, ReprScalarType, Row, RowArena}; + +// Column layout: a contiguous block per type. Columns are nullable. +const N_INT: usize = 2; // int4 [COL_INT0, COL_LONG0) +const N_LONG: usize = 1; // int8 [COL_LONG0, COL_BOOL0) +const N_BOOL: usize = 2; // bool [COL_BOOL0, COL_STR0) +const N_STR: usize = 1; // text [COL_STR0, N_COLS) +const COL_INT0: usize = 0; +const COL_LONG0: usize = COL_INT0 + N_INT; +const COL_BOOL0: usize = COL_LONG0 + N_LONG; +const COL_STR0: usize = COL_BOOL0 + N_BOOL; +const N_COLS: usize = COL_STR0 + N_STR; +const MAX_DEPTH: u32 = 6; +const ROWS_PER_EXPR: usize = 8; + +#[derive(Clone, Copy)] +enum Ty { + Int, + Long, + Bool, + Str, +} + +fn scalar_ty(ty: Ty) -> ReprScalarType { + match ty { + Ty::Int => ReprScalarType::Int32, + Ty::Long => ReprScalarType::Int64, + Ty::Bool => ReprScalarType::Bool, + Ty::Str => ReprScalarType::String, + } +} + +fn col_ty(col: usize) -> Ty { + if col < COL_LONG0 { + Ty::Int + } else if col < COL_BOOL0 { + Ty::Long + } else if col < COL_STR0 { + Ty::Bool + } else { + Ty::Str + } +} + +fn col_types() -> Vec { + (0..N_COLS) + .map(|c| scalar_ty(col_ty(c)).nullable(true)) + .collect() +} + +fn rand_ty(u: &mut Unstructured) -> arbitrary::Result { + Ok(match u.int_in_range(0u8..=3)? 
{ + 0 => Ty::Int, + 1 => Ty::Long, + 2 => Ty::Bool, + _ => Ty::Str, + }) +} + +/// A short, bounded string drawn from an alphabet that sometimes parses as an +/// integer (so the `text`→`int` casts hit both the success and error paths). +fn gen_string(u: &mut Unstructured) -> arbitrary::Result<String> { + const ALPHABET: &[u8] = b"01239-+ aZ"; + let n = u.int_in_range(0usize..=5)?; + let mut s = String::with_capacity(n); + for _ in 0..n { + let i = u.int_in_range(0usize..=ALPHABET.len() - 1)?; + s.push(ALPHABET[i] as char); + } + Ok(s) +} + +/// A `'static` datum for the non-text types (text needs borrowed backing +/// storage and is handled inline at each call site). +fn nonstr_datum(u: &mut Unstructured, ty: Ty) -> arbitrary::Result<Datum<'static>> { + Ok(match ty { + Ty::Int => Datum::Int32(i32::arbitrary(u)?), + Ty::Long => Datum::Int64(i64::arbitrary(u)?), + Ty::Bool => { + if bool::arbitrary(u)? { + Datum::True + } else { + Datum::False + } + } + Ty::Str => unreachable!("text datums are built from borrowed storage"), + }) +} + +fn gen_leaf(u: &mut Unstructured, ty: Ty) -> arbitrary::Result<MirScalarExpr> { + let st = scalar_ty(ty); + Ok(match u.int_in_range(0u8..=3)? { + 0 => { + let col = match ty { + Ty::Int => COL_INT0 + u.int_in_range(0..=N_INT - 1)?, + Ty::Long => COL_LONG0 + u.int_in_range(0..=N_LONG - 1)?, + Ty::Bool => COL_BOOL0 + u.int_in_range(0..=N_BOOL - 1)?, + Ty::Str => COL_STR0 + u.int_in_range(0..=N_STR - 1)?, + }; + MirScalarExpr::column(col) + } + 1 => match ty { + Ty::Str => { + let s = gen_string(u)?; + MirScalarExpr::literal_ok(Datum::String(&s), st) + } + _ => MirScalarExpr::literal_ok(nonstr_datum(u, ty)?, st), + }, + 2 => MirScalarExpr::literal_null(st), + // An error literal exercises reduce's error propagation/ordering. + _ => MirScalarExpr::literal(Err(EvalError::DivisionByZero), st), + }) +} + +/// Generate a small pool of bool "atoms", bounded subexpressions that the +/// wide-And/Or builder draws operands from *with repetition*. 
Reuse is the whole +/// point: `undistribute_and_or` only fires when the same operand appears across +/// several branches (e.g. `(a&&b)||(a&&c)`), and `reduce_and_canonicalize`'s +/// sort/dedup/short-circuit paths likewise need duplicates and unit/zero +/// literals to bite. A purely independent recursive generator almost never +/// repeats a subterm, so we seed the pool explicitly. +fn gen_bool_atoms(u: &mut Unstructured, depth: u32) -> arbitrary::Result<Vec<MirScalarExpr>> { + let n = u.int_in_range(2usize..=4)?; + let mut atoms = Vec::with_capacity(n + 2); + for _ in 0..n { + atoms.push(gen_expr(u, Ty::Bool, depth)?); + } + // Seed unit/zero literals so canonicalization's true/false handling fires. + atoms.push(MirScalarExpr::literal_ok(Datum::True, scalar_ty(Ty::Bool))); + atoms.push(MirScalarExpr::literal_ok(Datum::False, scalar_ty(Ty::Bool))); + Ok(atoms) +} + +/// Generate an n-ary `CallVariadic(And/Or, ..)` with 3+ operands, nesting other +/// n-ary And/Or directly inside so reduce's `flatten_associative` and +/// `undistribute_and_or` paths are reached. Operands are drawn (with repetition) +/// from a shared atom pool, which is what lets the undistribution and +/// dedup/short-circuit rewrites actually match. +fn gen_wide_and_or(u: &mut Unstructured, depth: u32) -> arbitrary::Result<MirScalarExpr> { + let outer_is_and = bool::arbitrary(u)?; + let atoms = gen_bool_atoms(u, depth.saturating_sub(1))?; + let pick = + |u: &mut Unstructured, atoms: &[MirScalarExpr]| -> arbitrary::Result<MirScalarExpr> { + let i = u.int_in_range(0..=atoms.len() - 1)?; + Ok(atoms[i].clone()) + }; + + let n_operands = u.int_in_range(3usize..=6)?; + let mut operands = Vec::with_capacity(n_operands); + for _ in 0..n_operands { + // With some probability nest an inner And/Or (often the *opposite* + // connective, the `(a&&b)||(a&&c)` distribution shape, sometimes the + // same one to exercise `flatten_associative`). + if depth > 0 && u.ratio(2u8, 5u8)? { + let inner_is_and = if u.ratio(3u8, 4u8)? 
{ + !outer_is_and + } else { + outer_is_and + }; + let n_inner = u.int_in_range(2usize..=4)?; + let mut inner = Vec::with_capacity(n_inner); + for _ in 0..n_inner { + inner.push(pick(u, &atoms)?); + } + operands.push(if inner_is_and { + MirScalarExpr::call_variadic(And, inner) + } else { + MirScalarExpr::call_variadic(Or, inner) + }); + } else { + operands.push(pick(u, &atoms)?); + } + } + Ok(if outer_is_and { + MirScalarExpr::call_variadic(And, operands) + } else { + MirScalarExpr::call_variadic(Or, operands) + }) +} + +fn gen_expr(u: &mut Unstructured, ty: Ty, depth: u32) -> arbitrary::Result<MirScalarExpr> { + if depth == 0 || u.ratio(2u8, 5u8)? { + return gen_leaf(u, ty); + } + let d = depth - 1; + match ty { + Ty::Int => match u.int_in_range(0u8..=7)? { + // If { cond: bool, then: int, els: int } + 0 => { + let cond = gen_expr(u, Ty::Bool, d)?; + let then = gen_expr(u, Ty::Int, d)?; + let els = gen_expr(u, Ty::Int, d)?; + Ok(cond.if_then_else(then, els)) + } + // int4 arithmetic. `+`/`-`/`*`/`%` may overflow or divide by zero, + // producing an EvalError tolerated by the one-directional oracle. + 1 => Ok(gen_expr(u, Ty::Int, d)?.call_binary(gen_expr(u, Ty::Int, d)?, func::AddInt32)), + 2 => Ok(gen_expr(u, Ty::Int, d)?.call_binary(gen_expr(u, Ty::Int, d)?, func::SubInt32)), + 3 => Ok(gen_expr(u, Ty::Int, d)?.call_binary(gen_expr(u, Ty::Int, d)?, func::MulInt32)), + 4 => Ok(gen_expr(u, Ty::Int, d)?.call_binary(gen_expr(u, Ty::Int, d)?, func::ModInt32)), + 5 => Ok(gen_expr(u, Ty::Int, d)?.call_unary(func::NegInt32)), + 6 => Ok(gen_expr(u, Ty::Int, d)?.call_unary(func::AbsInt32)), + // Casts that produce an int4. + _ => match u.int_in_range(0u8..=2)? { + 0 => Ok(gen_expr(u, Ty::Long, d)?.call_unary(func::CastInt64ToInt32)), + 1 => Ok(gen_expr(u, Ty::Str, d)?.call_unary(func::CastStringToInt32)), + _ => Ok(gen_expr(u, Ty::Str, d)?.call_unary(func::ByteLengthString)), + }, + }, + Ty::Long => { + match u.int_in_range(0u8..=4)? 
{ + 0 => { + let cond = gen_expr(u, Ty::Bool, d)?; + let then = gen_expr(u, Ty::Long, d)?; + let els = gen_expr(u, Ty::Long, d)?; + Ok(cond.if_then_else(then, els)) + } + 1 => Ok(gen_expr(u, Ty::Long, d)? + .call_binary(gen_expr(u, Ty::Long, d)?, func::AddInt64)), + 2 => Ok(gen_expr(u, Ty::Long, d)? + .call_binary(gen_expr(u, Ty::Long, d)?, func::SubInt64)), + 3 => Ok(gen_expr(u, Ty::Long, d)? + .call_binary(gen_expr(u, Ty::Long, d)?, func::MulInt64)), + // Casts that produce an int8. + _ => match u.int_in_range(0u8..=1)? { + 0 => Ok(gen_expr(u, Ty::Int, d)?.call_unary(func::CastInt32ToInt64)), + _ => Ok(gen_expr(u, Ty::Str, d)?.call_unary(func::CastStringToInt64)), + }, + } + } + Ty::Bool => match u.int_in_range(0u8..=7)? { + 0 => { + let cond = gen_expr(u, Ty::Bool, d)?; + let then = gen_expr(u, Ty::Bool, d)?; + let els = gen_expr(u, Ty::Bool, d)?; + Ok(cond.if_then_else(then, els)) + } + 1 => Ok(gen_expr(u, Ty::Bool, d)?.and(gen_expr(u, Ty::Bool, d)?)), + 2 => Ok(gen_expr(u, Ty::Bool, d)?.or(gen_expr(u, Ty::Bool, d)?)), + 3 => Ok(gen_expr(u, Ty::Bool, d)?.not()), + // A wide, nested n-ary And/Or with shared operands, the shape that + // reaches flatten_associative / undistribute_and_or. + 7 => gen_wide_and_or(u, d), + // A comparison of two operands of a common type yields a bool. + 4 => { + let t = rand_ty(u)?; + let a = gen_expr(u, t, d)?; + let b = gen_expr(u, t, d)?; + Ok(match u.int_in_range(0u8..=4)? { + 0 => a.call_binary(b, func::Eq), + 1 => a.call_binary(b, func::Lt), + 2 => a.call_binary(b, func::Gt), + 3 => a.call_binary(b, func::Lte), + _ => a.call_binary(b, func::Gte), + }) + } + // IS NULL of any-typed operand yields a bool. + 5 => { + let t = rand_ty(u)?; + Ok(gen_expr(u, t, d)?.call_is_null()) + } + // int4 -> bool cast. + _ => Ok(gen_expr(u, Ty::Int, d)?.call_unary(func::CastInt32ToBool)), + }, + Ty::Str => match u.int_in_range(0u8..=3)? 
{ + 0 => { + let cond = gen_expr(u, Ty::Bool, d)?; + let then = gen_expr(u, Ty::Str, d)?; + let els = gen_expr(u, Ty::Str, d)?; + Ok(cond.if_then_else(then, els)) + } + 1 => Ok(gen_expr(u, Ty::Str, d)? + .call_binary(gen_expr(u, Ty::Str, d)?, func::TextConcatBinary)), + 2 => Ok(gen_expr(u, Ty::Int, d)?.call_unary(func::CastInt32ToString)), + _ => Ok(gen_expr(u, Ty::Bool, d)?.call_unary(func::CastBoolToString)), + }, + } +} + +fn gen_row(u: &mut Unstructured) -> arbitrary::Result<Row> { + let mut row = Row::default(); + let mut packer = row.packer(); + for c in 0..N_COLS { + if u.ratio(1u8, 4u8)? { + packer.push(Datum::Null); + continue; + } + match col_ty(c) { + Ty::Str => { + let s = gen_string(u)?; + packer.push(Datum::String(&s)); + } + ty => packer.push(nonstr_datum(u, ty)?), + } + } + drop(packer); + Ok(row) +} + +fn run(u: &mut Unstructured) -> arbitrary::Result<()> { + let types = col_types(); + let top_ty = rand_ty(u)?; + let expr = gen_expr(u, top_ty, MAX_DEPTH)?; + + let mut reduced = expr.clone(); + reduced.reduce(&types); + + for _ in 0..ROWS_PER_EXPR { + let row = gen_row(u)?; + let datums: Vec<Datum> = row.iter().collect(); + let arena = RowArena::new(); + let original = expr.eval(&datums, &arena); + let folded = reduced.eval(&datums, &arena); + // The invariant is one-directional. `reduce` is *permitted* to eliminate + // a runtime error: e.g. `reduce_if` collapses `If(c, x, x)` to `x`, which + // drops `c` (and any error it would raise). So an `Err` original may be + // turned into anything. But it must never change a *successful* value, and + // must never introduce an error where evaluation succeeded: when the + // original is `Ok(v)`, the reduced expression must also be exactly `Ok(v)`. 
+ if original.is_ok() { + assert_eq!( + original, folded, + "reduce changed a successful evaluation\n expr = {expr:?}\n reduced = {reduced:?}\n row = {row:?}" + ); + } + } + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let mut u = Unstructured::new(data); + let _ = run(&mut u); +}); diff --git a/src/expr/fuzz/fuzz_targets/timezone_convert.rs b/src/expr/fuzz/fuzz_targets/timezone_convert.rs new file mode 100644 index 0000000000000..80f7651d63f7e --- /dev/null +++ b/src/expr/fuzz/fuzz_targets/timezone_convert.rs @@ -0,0 +1,96 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: the `AT TIME ZONE` conversion math. Parsing an untrusted +//! timezone and applying it to a timestamp does non-trivial offset arithmetic +//! (DST transition lookups, leap-second-aware add/sub, Duration math), all of +//! which must return an `EvalError` rather than panic on any input. We exercise +//! both directions: TIMESTAMP -> TIMESTAMPTZ and TIMESTAMPTZ -> TIMESTAMP. +//! +//! An arbitrary `&str` almost never parses to a `Timezone`, and when it does +//! it's overwhelmingly a trivial `FixedOffset`, so the interesting code (the +//! named-zone DST-transition lookup, ambiguous/nonexistent local times) would +//! barely run. So most of the time we pick a real IANA zone *with DST* (and a +//! few fixed offsets) so the transition math actually executes. A minority arm +//! still feeds an arbitrary string to keep the parser's reject paths covered. 
+ +#![no_main] + +use chrono::DateTime; +use libfuzzer_sys::arbitrary::{self, Arbitrary, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_expr::{Eval, MirScalarExpr, UnaryFunc, func}; +use mz_pgtz::timezone::{Timezone, TimezoneSpec}; +use mz_repr::adt::timestamp::CheckedTimestamp; +use mz_repr::{Datum, ReprScalarType, RowArena}; + +/// Real zones whose offsets shift (DST / sub-hour / historical), plus a couple +/// of fixed offsets, the inputs that actually exercise the conversion math. +const ZONES: &[&str] = &[ + "America/New_York", + "America/Los_Angeles", + "Europe/London", + "Europe/Berlin", + "Europe/Lisbon", + "Australia/Lord_Howe", // 30-minute DST shift + "Pacific/Chatham", // :45 offset with DST + "Antarctica/Troll", // 2-hour DST jump + "Asia/Kolkata", // :30, no DST + "America/Sao_Paulo", + "UTC", + "+05:30", + "-08", +]; + +fn run(u: &mut Unstructured) -> arbitrary::Result<()> { + // 3-in-4: a real (mostly DST-bearing) zone, otherwise an arbitrary string. + let tz = if u.int_in_range(0u8..=3)? != 0 { + let Ok(tz) = Timezone::parse(u.choose(ZONES)?, TimezoneSpec::Iso) else { + return Ok(()); + }; + tz + } else { + let tz_str = <&str>::arbitrary(u)?; + let spec = if bool::arbitrary(u)? { + TimezoneSpec::Iso + } else { + TimezoneSpec::Posix + }; + let Ok(tz) = Timezone::parse(tz_str, spec) else { + return Ok(()); + }; + tz + }; + let secs = u.int_in_range(-8_000_000_000_000i64..=8_000_000_000_000)?; + let nanos = u.int_in_range(0u32..=999_999_999)?; + let Some(dt) = DateTime::from_timestamp(secs, nanos) else { + return Ok(()); + }; + let arena = RowArena::new(); + + // TIMESTAMP `AT TIME ZONE tz` -> TIMESTAMPTZ. + if let Ok(ts) = CheckedTimestamp::from_timestamplike(dt.naive_utc()) { + let expr = MirScalarExpr::literal_ok(Datum::Timestamp(ts), ReprScalarType::Timestamp) + .call_unary(UnaryFunc::TimezoneTimestamp(func::TimezoneTimestamp(tz))); + let _ = expr.eval(&[], &arena); + } + + // TIMESTAMPTZ `AT TIME ZONE tz` -> TIMESTAMP. 
+ if let Ok(tstz) = CheckedTimestamp::from_timestamplike(dt) { + let expr = MirScalarExpr::literal_ok(Datum::TimestampTz(tstz), ReprScalarType::TimestampTz) + .call_unary(UnaryFunc::TimezoneTimestampTz(func::TimezoneTimestampTz(tz))); + let _ = expr.eval(&[], &arena); + } + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let mut u = Unstructured::new(data); + let _ = run(&mut u); +}); diff --git a/src/expr/src/scalar.rs b/src/expr/src/scalar.rs index 39d2de76d8ff9..1ef06bf5e0745 100644 --- a/src/expr/src/scalar.rs +++ b/src/expr/src/scalar.rs @@ -849,6 +849,49 @@ impl MirScalarExpr { } } + /// Whether [`Self::undistribute_and_or`] can rewrite `self` without changing + /// its error semantics. + /// + /// Undistribution recombines the operands *not* common to every disjunct into + /// a new AND/OR, changing the short-circuit context they evaluate in. Only a + /// dominating operand (`false`/`true`), not a NULL one, absorbs another + /// operand's error, so recombining a could-error operand can change whether a + /// row errors (e.g. `a OR (a AND <err>)` raises for a NULL `a`, but + /// undistributes to `a`). Operands common to every disjunct are factored out + /// unchanged and stay sound, which is what lets a shared temporal filter like + /// `mz_now() < <expr>` (whose cast can error) be factored out, as the renderer + /// requires. + fn undistribution_preserves_errors(&self) -> bool { + let MirScalarExpr::CallVariadic { + func: func @ (VariadicFunc::And(_) | VariadicFunc::Or(_)), + exprs, + } = self + else { + return true; + }; + // Fast path for the common case: with no erroring operand there is + // nothing to preserve, so skip the per-operand bookkeeping below. + // `could_error` recurses, so this also covers nested operands. + if !exprs.iter().any(|o| o.could_error()) { + return true; + } + let inner = func.switch_and_or(); + // The operands of a disjunct (a non-`inner` disjunct is its own singleton). 
+ let operands = |o: &MirScalarExpr| match o { + MirScalarExpr::CallVariadic { func, exprs } if *func == inner => exprs.clone(), + _ => vec![o.clone()], + }; + let common = exprs + .iter() + .map(|o| BTreeSet::from_iter(operands(o))) + .reduce(|a, b| &a & &b) + .unwrap_or_default(); + exprs + .iter() + .flat_map(operands) + .all(|op| !op.could_error() || common.contains(&op)) + } + /// AND/OR undistribution (factoring out) to apply at each `MirScalarExpr`. /// /// This method attempts to apply one of the [distribution laws][distributivity] @@ -922,6 +965,11 @@ impl MirScalarExpr { while old_self != *self { old_self = self.clone(); self.reduce_and_canonicalize_and_or(); // We don't want to deal with 1-arg AND/OR at the top + + if !self.undistribution_preserves_errors() { + return; + } + if let MirScalarExpr::CallVariadic { exprs: outer_operands, func: outer_func @ (VariadicFunc::Or(_) | VariadicFunc::And(_)), @@ -2475,6 +2523,78 @@ mod tests { use super::*; use crate::scalar::func::variadic::Coalesce; + #[mz_ore::test] + fn test_reduce_and_or_preserves_operand_errors() { + // #37049: AND/OR are non-strict, so a `false`/`true` operand absorbs an + // erroring operand at runtime. `reduce` must not fold or absorb an + // erroring operand into an error the original never raises. + let bool_typ = ReprScalarType::Bool; + let types = vec![bool_typ.clone().nullable(true)]; + let err = || MirScalarExpr::literal(Err(EvalError::DivisionByZero), bool_typ.clone()); + let col = || MirScalarExpr::column(0); + let arena = RowArena::new(); + + // `x AND <err>` must not fold to the error: `false AND <err>` is `false`. + let mut and = col().and(err()); + and.reduce(&types); + assert!(!and.is_literal_err(), "{and:?}"); + assert_eq!(and.eval(&[Datum::False], &arena), Ok(Datum::False)); + + // `a OR (a AND <err>)` must not absorb to `a`: a NULL `a` surfaces the + // error (`NULL OR (NULL AND <err>)` = `<err>`). 
+ let mut or = col().or(col().and(err())); + or.reduce(&types); + assert_ne!(or, col(), "{or:?}"); + assert_eq!(or.eval(&[Datum::True], &arena), Ok(Datum::True)); + assert_eq!(or.eval(&[Datum::False], &arena), Ok(Datum::False)); + assert_eq!( + or.eval(&[Datum::Null], &arena), + Err(EvalError::DivisionByZero) + ); + } + + #[mz_ore::test] + fn test_reduce_and_or_undistributes_common_error() { + // CLU-137: undistribution must still factor out a could-error operand + // common to every disjunct (e.g. a temporal `mz_now() < <expr>`), so it + // becomes a standalone conjunct the renderer can extract. The reverted + // #37049 guard skipped undistribution for any could-error expression, + // burying such predicates inside the OR. + let bool_typ = ReprScalarType::Bool; + let types = vec![ + bool_typ.clone().nullable(true), + bool_typ.clone().nullable(true), + ]; + let err = || MirScalarExpr::literal(Err(EvalError::DivisionByZero), bool_typ.clone()); + let arena = RowArena::new(); + + // `(a AND <err>) OR (b AND <err>)` --> `<err> AND (a OR b)`. + let original = MirScalarExpr::column(0) + .and(err()) + .or(MirScalarExpr::column(1).and(err())); + let mut reduced = original.clone(); + reduced.reduce(&types); + assert!( + matches!( + &reduced, + MirScalarExpr::CallVariadic { + func: VariadicFunc::And(_), + .. + } + ), + "common error operand not factored out: {reduced:?}" + ); + // ...and error semantics are preserved over every assignment. 
+ for a in [Datum::True, Datum::False, Datum::Null] { + for b in [Datum::True, Datum::False, Datum::Null] { + assert_eq!( + reduced.eval(&[a, b], &arena), + original.eval(&[a, b], &arena) + ); + } + } + } + #[mz_ore::test] #[cfg_attr(miri, ignore)] // error: unsupported operation: can't call foreign function `rust_psm_stack_pointer` on OS `linux` fn test_reduce() { diff --git a/src/expr/src/scalar/reduce/variadic.rs b/src/expr/src/scalar/reduce/variadic.rs index e6cfbc2e4dec2..8dee7e99f6c37 100644 --- a/src/expr/src/scalar/reduce/variadic.rs +++ b/src/expr/src/scalar/reduce/variadic.rs @@ -51,9 +51,22 @@ pub(super) fn reduce_call_variadic( *e = MirScalarExpr::literal_null(e.typ(column_types).scalar_type); return; } - if let Some(err) = exprs.iter().find_map(|x| x.as_literal_err()) { - *e = MirScalarExpr::literal(Err(err.clone()), e.typ(column_types).scalar_type); - return; + // Fold the call to an operand's literal error only when the function is + // strict in errors: any operand error must make the whole call error. The + // non-strict variadics are excluded, because they can absorb an operand's + // error at runtime: AND/OR via a dominating `false`/`true` operand + // (`false AND <err>` is `false`), and ErrorIfNull because it evaluates its + // message argument only when the first argument is null. Every other variadic + // here evaluates all of its operands, so the fold is valid. (Coalesce, also + // non-strict, bailed out above.) + if !matches!( + func, + VariadicFunc::And(_) | VariadicFunc::Or(_) | VariadicFunc::ErrorIfNull(_) + ) { + if let Some(err) = exprs.iter().find_map(|x| x.as_literal_err()) { + *e = MirScalarExpr::literal(Err(err.clone()), e.typ(column_types).scalar_type); + return; + } } // Per-function dispatch. 
Arms are mutually exclusive on discriminant; the diff --git a/src/interchange/fuzz/.gitignore b/src/interchange/fuzz/.gitignore new file mode 100644 index 0000000000000..ab0eaa1a49031 --- /dev/null +++ b/src/interchange/fuzz/.gitignore @@ -0,0 +1,5 @@ +target/ +corpus/ +artifacts/ +coverage/ +Cargo.lock diff --git a/src/interchange/fuzz/Cargo.toml b/src/interchange/fuzz/Cargo.toml new file mode 100644 index 0000000000000..579d2759a03ec --- /dev/null +++ b/src/interchange/fuzz/Cargo.toml @@ -0,0 +1,48 @@ +# Fuzz crate for mz-interchange: decode untrusted source bytes (Kafka message +# payloads, webhook bodies) into Materialize `Row`s. These decoders are the +# first thing to touch external data, so a panic reachable from the wire is a +# real availability bug for source ingestion. +# +# Excluded from the main workspace because libFuzzer requires nightly Rust. + +[package] +workspace = "../../../test/cargo-fuzz" +name = "mz-interchange-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +chrono = "0.4" +libfuzzer-sys = "0.4" +mz-interchange = { path = ".." 
} +mz-repr = { path = "../../repr" } +prost = "0.14.3" +prost-types = "0.14.3" +serde_json = "1" +tokio = { version = "1", features = ["rt"] } +uuid = "1" + +[[bin]] +name = "avro_decode_fuzzed_schema" +path = "fuzz_targets/avro_decode_fuzzed_schema.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "json_encode" +path = "fuzz_targets/json_encode.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "protobuf_decode_fuzzed_schema" +path = "fuzz_targets/protobuf_decode_fuzzed_schema.rs" +test = false +doc = false +bench = false diff --git a/src/interchange/fuzz/fuzz_targets/avro_decode_fuzzed_schema.rs b/src/interchange/fuzz/fuzz_targets/avro_decode_fuzzed_schema.rs new file mode 100644 index 0000000000000..3198530edc91b --- /dev/null +++ b/src/interchange/fuzz/fuzz_targets/avro_decode_fuzzed_schema.rs @@ -0,0 +1,812 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `mz_interchange::avro::Decoder` decoding an untrusted Avro +//! message body against a *fuzzer-generated* reader schema instead of one fixed +//! schema. The decoder's behaviour is schema-directed, so a fixed schema only +//! ever walks one set of `AvroFlatDecoder` paths. The bug-prone paths are the +//! schema-dependent ones that a single schema barely touches: `decimal` +//! (unscaled two's-complement +//! bytes -> `Numeric`, where precision/scale and byte length interact), `fixed` +//! (size-N byte runs), `enum` (symbol index -> string, an out-of-range index is +//! attacker-controlled), the logical date/timestamp conversions, and deeply +//! nested records/arrays/maps and unions. +//! +//! 
Critically, we don't just generate the schema and feed *random bytes* as the +//! body: a strict binary decoder rejects almost all random bytes at the first +//! field, so a random body never reaches the deep decode logic. Instead we +//! generate a structured type (`Ty`), serialize it to the reader-schema JSON, +//! and *Avro-binary-encode a random value against that same type*, so the body +//! is valid by construction and the decoder walks all the way through. Coverage +//! guidance then learns which byte streams produce which shapes. We don't lose +//! the error-path coverage a random body gave, though: a quarter of the inputs +//! feed the raw remaining bytes, and others truncate or single-byte-corrupt the +//! valid encoding. Either way, an accepted schema must never panic. +//! +//! Beyond "never panic", we add one error oracle. Most bodies *should* be +//! rejected (random bytes, truncations, an out-of-range `enum` index, an +//! over-precision `decimal`, invalid `json`), so a decode `Err` is usually the +//! correct outcome and is discarded. But when the body is a *clean* encoding of +//! a type tree that is guaranteed decodable for any value the encoder emits +//! (plain scalars, `fixed`, and structural composites over them, see +//! `decode_infallible`), the decoder is round-tripping bytes it *must* accept, +//! so there a decode error is a real bug and we assert success. A panic-only +//! oracle never notices a "valid input wrongly rejected" regression (cf. +//! #37087's deferred union-promotion error). +//! +//! A fraction of inputs instead drive a round-trip *correctness* oracle +//! (`run_roundtrip`): a decode that succeeds but yields the *wrong* datum slips +//! past both the panic and the decode-success oracles. So we build a record of +//! plain scalars (bool/int/long/string/bytes and their `["null", T]` form) whose +//! decoded `Datum` is exactly determined, encode it, decode it, and assert the +//! decoded `Row` equals the values we put in. +//! 
+//! Rather than freeze the schema-dependent knobs at one value each, we vary the +//! ones that drive distinct decode arithmetic: `decimal` precision (1..=39, the +//! `NUMERIC_DATUM_MAX_PRECISION` boundary), scale (0..=precision, where +//! `parse_decimal` rejects scale > precision and `twos_complement_be_to_numeric` +//! interprets it), and the backing `fixed` size, so the two's-complement byte +//! run and the precision/scale interaction are not pinned. The decimal +//! *value* bytes are likewise biased (see `push_twos_complement` / +//! `gen_decimal_len`) toward the patterns that stress that arithmetic rather +//! than only uniform-random runs: the empty run (== 0), all-`0x00`/`0xFF` sign +//! extension, the `0x80`/`0x7F` magnitude extremes, and lengths bracketing the +//! narrow/wide split (17) and the wide/overflow regime. We also emit a +//! `json` logical field (a `string` tagged `connect.name:io.debezium.data.Json`) +//! whose body is real JSON text, reaching the `AvroFlatDecoder::json` -> +//! `JsonbPacker` path that a plain string never touches. We likewise emit multi-variant +//! *essential* unions like `["int","string"]`, accepted only as a record field +//! (each non-null variant expands to its own nullable column) and rejected +//! elsewhere, exercising `get_union_columns`' field-invention/expansion logic +//! that the `["null", T]` nullability pattern alone never reaches. + +#![no_main] + +use std::sync::OnceLock; + +use libfuzzer_sys::arbitrary::{self, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_interchange::avro::{Decoder, WriterSchemaProvider}; +use mz_repr::{Datum, Row}; + +fn rt() -> &'static tokio::runtime::Runtime { + static RT: OnceLock<tokio::runtime::Runtime> = OnceLock::new(); + RT.get_or_init(|| { + tokio::runtime::Builder::new_current_thread() + .build() + .expect("current-thread runtime") + }) +} + +/// A generated Avro type. We keep it structured (rather than going straight to +/// JSON) so that the *body* encoder can walk the exact same type. 
An array +/// needs one item type in the schema but N encoded values in the body, so the +/// schema and the body can't share a single recursive pass. +enum Ty { + Bool, + Int, + Long, + Float, + Double, + String, + Bytes, + /// `decimal` logical type over `bytes`: (precision, scale). The unscaled + /// value is length-prefixed, so the byte-run length is unbounded. + DecimalBytes(u32, u32), + /// `decimal` logical type over a `fixed`: (unique-name counter, fixed size, + /// precision, scale). The wire body is exactly `size` bytes. + DecimalFixed(u32, u32, u32, u32), + Date, + TimestampMillis, + TimestampMicros, + Uuid, + /// `string` tagged with `connect.name = io.debezium.data.Json`, which the + /// avro schema parser maps to `SchemaPiece::Json` (-> SQL `Jsonb`). The wire + /// body is a length-prefixed run of valid JSON text. + Json, + /// Plain `fixed` byte run: (unique-name counter, size). + Fixed(u32, u32), + /// `enum` with symbols A/B/C (unique-name counter). + Enum(u32), + /// `record` (unique-name counter, fields). Each field is itself a `Ty`, + /// except that a field may be an `EssentialUnion`, which is only valid in + /// this position. + Record(u32, Vec<Ty>), + Array(Box<Ty>), + Map(Box<Ty>), + /// `["null", T]`, the nullability-pattern union, valid anywhere a single + /// column is expected (record field, array item, map value). + Nullable(Box<Ty>), + /// A multi-variant *essential* union of non-null variants, optionally with a + /// leading `null`: e.g. `["int","string"]` or `["null","int","string"]`. + /// `validate_schema_2` rejects this everywhere except as a record field, + /// where `get_union_columns` expands it to one nullable column per non-null + /// variant. Stored as (has_null, variants). Generated only as a record field. + EssentialUnion(bool, Vec<Ty>), +} + +/// Generate a `decimal`'s precision/scale. `parse_decimal` requires +/// `0 <= scale <= precision` and the SQL validator caps precision at +/// `NUMERIC_DATUM_MAX_PRECISION` (39). 
Pick within those bounds so the schema is +/// accepted and we vary the decode arithmetic across the whole legal range. +fn gen_decimal_params(u: &mut Unstructured) -> arbitrary::Result<(u32, u32)> { + let precision = u.int_in_range(1u32..=39)?; + let scale = u.int_in_range(0u32..=precision)?; + Ok((precision, scale)) +} + +/// Generate one syntactically valid Avro type. `counter` keeps named types +/// (record/enum/fixed) unique within the schema, since duplicate names make Avro +/// schema parsing fail, wasting the whole input. This never returns an +/// `EssentialUnion` (only valid as a record field, see `gen_field`). +fn gen_ty(u: &mut Unstructured, counter: &mut u32, depth: u32) -> arbitrary::Result<Ty> { + // Without remaining input (or at max depth) fall back to a primitive so + // generation always terminates with a valid type. + let choice = if depth == 0 || u.is_empty() { + u.int_in_range(0u8..=9)? + } else { + u.int_in_range(0u8..=13)? + }; + Ok(match choice { + 0 => match u.int_in_range(0u8..=6)? { + 0 => Ty::Bool, + 1 => Ty::Int, + 2 => Ty::Long, + 3 => Ty::Float, + 4 => Ty::Double, + 5 => Ty::String, + _ => Ty::Bytes, + }, + 1 => { + let (p, s) = gen_decimal_params(u)?; + Ty::DecimalBytes(p, s) + } + 2 => { + *counter += 1; + let (p, s) = gen_decimal_params(u)?; + // `fixed` requires a positive size. Allow runs both shorter and + // longer than the canonical 16/24 to vary the two's-complement path. 
+ let size = u.int_in_range(1u32..=40)?; + Ty::DecimalFixed(*counter, size, p, s) + } + 3 => Ty::Date, + 4 => Ty::TimestampMillis, + 5 => Ty::TimestampMicros, + 6 => Ty::Uuid, + 7 => { + *counter += 1; + let size = u.int_in_range(0u32..=24)?; + Ty::Fixed(*counter, size) + } + 8 => { + *counter += 1; + Ty::Enum(*counter) + } + 9 => Ty::Json, + 10 => { + *counter += 1; + let name = *counter; + let n = u.int_in_range(0u8..=3)?; + let mut fields = Vec::with_capacity(n.into()); + for _ in 0..n { + fields.push(gen_field(u, counter, depth - 1)?); + } + Ty::Record(name, fields) + } + 11 => Ty::Array(Box::new(gen_ty(u, counter, depth - 1)?)), + 12 => Ty::Map(Box::new(gen_ty(u, counter, depth - 1)?)), + _ => Ty::Nullable(Box::new(gen_ty(u, counter, depth - 1)?)), + }) +} + +/// Generate a single record field. Usually a plain `gen_ty`, but occasionally a +/// multi-variant essential union, valid only here. Variants are drawn from a +/// set of *distinct* Avro type kinds (Avro rejects a union with two branches of +/// the same unnamed type), each a single-column `validate_schema_2` type. +fn gen_field(u: &mut Unstructured, counter: &mut u32, depth: u32) -> arbitrary::Result { + if depth == 0 || u.is_empty() || u.int_in_range(0u8..=4)? != 0 { + return gen_ty(u, counter, depth); + } + // Pool of distinct, single-column variant kinds (no duplicate Avro kind). + let pool: &[fn() -> Ty] = &[ + || Ty::Bool, + || Ty::Int, + || Ty::Long, + || Ty::Float, + || Ty::Double, + || Ty::String, + || Ty::Bytes, + ]; + let nvariants = u.int_in_range(2usize..=pool.len())?; + // Choose `nvariants` distinct indices into the pool, preserving order. + let mut chosen = Vec::with_capacity(nvariants); + let mut idx = 0usize; + let mut remaining = nvariants; + while remaining > 0 && idx < pool.len() { + let left = pool.len() - idx; + // Pick this index iff we still need as many as are left, or by coin flip. + if remaining == left || u.int_in_range(0u8..=1)? 
== 1 { + chosen.push(pool[idx]()); + remaining -= 1; + } + idx += 1; + } + let has_null = u.int_in_range(0u8..=1)? == 1; + Ok(Ty::EssentialUnion(has_null, chosen)) +} + +/// Serialize a `Ty` to its reader-schema JSON fragment. +fn ty_to_json(ty: &Ty, out: &mut String) { + match ty { + Ty::Bool => out.push_str("\"boolean\""), + Ty::Int => out.push_str("\"int\""), + Ty::Long => out.push_str("\"long\""), + Ty::Float => out.push_str("\"float\""), + Ty::Double => out.push_str("\"double\""), + Ty::String => out.push_str("\"string\""), + Ty::Bytes => out.push_str("\"bytes\""), + Ty::DecimalBytes(p, s) => out.push_str(&format!( + "{{\"type\":\"bytes\",\"logicalType\":\"decimal\",\"precision\":{p},\"scale\":{s}}}" + )), + Ty::DecimalFixed(n, size, p, s) => out.push_str(&format!( + "{{\"type\":\"fixed\",\"name\":\"D{n}\",\"size\":{size},\"logicalType\":\"decimal\",\"precision\":{p},\"scale\":{s}}}" + )), + Ty::Date => out.push_str("{\"type\":\"int\",\"logicalType\":\"date\"}"), + Ty::TimestampMillis => { + out.push_str("{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"}") + } + Ty::TimestampMicros => { + out.push_str("{\"type\":\"long\",\"logicalType\":\"timestamp-micros\"}") + } + Ty::Uuid => out.push_str("{\"type\":\"string\",\"logicalType\":\"uuid\"}"), + // The avro parser maps a `string` with this connect.name to + // `SchemaPiece::Json`. 
+ Ty::Json => out.push_str( + "{\"type\":\"string\",\"connect.name\":\"io.debezium.data.Json\"}", + ), + Ty::Fixed(n, size) => { + out.push_str(&format!("{{\"type\":\"fixed\",\"name\":\"F{n}\",\"size\":{size}}}")) + } + Ty::Enum(n) => out.push_str(&format!( + "{{\"type\":\"enum\",\"name\":\"E{n}\",\"symbols\":[\"A\",\"B\",\"C\"]}}" + )), + Ty::Record(n, fields) => { + out.push_str(&format!("{{\"type\":\"record\",\"name\":\"R{n}\",\"fields\":[")); + for (i, f) in fields.iter().enumerate() { + if i > 0 { + out.push(','); + } + out.push_str(&format!("{{\"name\":\"g{i}\",\"type\":")); + ty_to_json(f, out); + out.push('}'); + } + out.push_str("]}"); + } + Ty::Array(item) => { + out.push_str("{\"type\":\"array\",\"items\":"); + ty_to_json(item, out); + out.push('}'); + } + Ty::Map(values) => { + out.push_str("{\"type\":\"map\",\"values\":"); + ty_to_json(values, out); + out.push('}'); + } + Ty::Nullable(inner) => { + out.push_str("[\"null\","); + ty_to_json(inner, out); + out.push(']'); + } + Ty::EssentialUnion(has_null, variants) => { + out.push('['); + if *has_null { + out.push_str("\"null\""); + } + for (i, v) in variants.iter().enumerate() { + if *has_null || i > 0 { + out.push(','); + } + ty_to_json(v, out); + } + out.push(']'); + } + } +} + +/// Avro encodes int/long as zig-zag varints. +fn encode_long(n: i64, out: &mut Vec) { + let mut z = ((n << 1) ^ (n >> 63)) as u64; + loop { + if z & !0x7f == 0 { + out.push(z as u8); + return; + } + out.push(((z & 0x7f) | 0x80) as u8); + z >>= 7; + } +} + +/// Encode a length-prefixed run of printable ASCII (valid UTF-8 so `string` +/// decode succeeds and we reach the deep paths). +fn encode_str(u: &mut Unstructured, out: &mut Vec) -> arbitrary::Result<()> { + let n = u.int_in_range(0usize..=8)?; + let mut s = Vec::with_capacity(n); + for _ in 0..n { + s.push(u.int_in_range(0x20u8..=0x7e)?); + } + encode_long(s.len() as i64, out); + out.extend_from_slice(&s); + Ok(()) +} + +/// Encode a length-prefixed run of arbitrary bytes. 
+fn encode_bytes(u: &mut Unstructured, max: usize, out: &mut Vec) -> arbitrary::Result<()> { + let n = u.int_in_range(0usize..=max)?; + encode_long(n as i64, out); + for _ in 0..n { + out.push(u.arbitrary::()?); + } + Ok(()) +} + +/// Encode a length-prefixed run of valid JSON text, the wire form of a `json` +/// logical field, which the decoder feeds to `serde_json::from_slice`. A small +/// menu of well-formed JSON values covers the `JsonbPacker` shapes (scalars, +/// nested object/array). A fraction of the time we emit deliberately invalid +/// text to keep the `BadJson` error path covered. +fn encode_json(u: &mut Unstructured, out: &mut Vec) -> arbitrary::Result<()> { + let s: String = match u.int_in_range(0u8..=8)? { + 0 => "null".into(), + 1 => "true".into(), + 2 => format!("{}", u.arbitrary::()?), + 3 => format!("{}.5", u.arbitrary::()?), + 4 => { + let n = u.int_in_range(0usize..=6)?; + let mut t = String::from("\""); + for _ in 0..n { + t.push(char::from(u.int_in_range(0x20u8..=0x7e)?).max('a')); + } + t.push('"'); + t + } + 5 => "[]".into(), + 6 => "{}".into(), + 7 => format!("[{},{},null]", u.arbitrary::()?, u.arbitrary::()?), + // Not valid JSON: exercise the decoder's BadJson error path. + _ => "{".into(), + }; + encode_long(s.len() as i64, out); + out.extend_from_slice(s.as_bytes()); + Ok(()) +} + +/// Push `len` bytes of a two's-complement `decimal` value, biased toward the +/// sign-extension and boundary patterns that exercise the most intricate decode +/// arithmetic in `numeric::twos_complement_be_to_numeric`: the narrow/wide split +/// (`len <= 17`), the `negate_twos_complement_le` path (any negative value), and +/// the wide-representation precision-overflow handling. Most of the time it +/// still emits a uniform-random run so coverage guidance keeps exploring. +fn push_twos_complement(u: &mut Unstructured, len: usize, out: &mut Vec) -> arbitrary::Result<()> { + let fill = match u.int_in_range(0u8..=9)? 
{ + 0 => 0x00u8, // zero / positive sign-extension + 1 => 0xFF, // -1 / negative sign-extension + 2 => 0x80, // sign bit set: most-negative leading byte + 3 => 0x7F, // largest-magnitude positive leading byte + // Uniform-random run the majority of the time. + _ => { + for _ in 0..len { + out.push(u.arbitrary::()?); + } + return Ok(()); + } + }; + // A constant run, optionally with one differing leading byte to drop a + // boundary value into an otherwise sign-extended field. + let lead = if len > 0 && u.int_in_range(0u8..=1)? == 1 { + Some(u.arbitrary::()?) + } else { + None + }; + for i in 0..len { + out.push(if i == 0 { lead.unwrap_or(fill) } else { fill }); + } + Ok(()) +} + +/// Pick the length of a `decimal`-over-`bytes` run, biased to the lengths that +/// bracket the narrow/wide split (17) and the wide/overflow regime, plus the +/// degenerate empty and single-byte cases. +fn gen_decimal_len(u: &mut Unstructured) -> arbitrary::Result { + Ok(match u.int_in_range(0u8..=9)? { + 0 => 0, + 1 => 1, + 2 => 16, + 3 => 17, + 4 => 18, + 5 => 40, + _ => u.int_in_range(0usize..=40)?, + }) +} + +/// Avro-binary-encode one random value of type `ty` into `out`. +fn encode_value(u: &mut Unstructured, ty: &Ty, out: &mut Vec) -> arbitrary::Result<()> { + match ty { + Ty::Bool => out.push(u.int_in_range(0u8..=1)?), + Ty::Int | Ty::Date => encode_long(i64::from(u.arbitrary::()?), out), + Ty::Long | Ty::TimestampMillis | Ty::TimestampMicros => { + encode_long(u.arbitrary::()?, out) + } + Ty::Float => out.extend_from_slice(&u.arbitrary::()?.to_le_bytes()), + Ty::Double => out.extend_from_slice(&u.arbitrary::()?.to_le_bytes()), + Ty::String => encode_str(u, out)?, + Ty::Bytes => encode_bytes(u, 12, out)?, + // Length-prefixed unscaled two's-complement run. The length (incl. the + // empty and narrow/wide-boundary cases) and the byte pattern are both + // biased toward the values that stress the decimal decode arithmetic. 
A + // zero-length run is the empty-decimal == 0 case (regression for the + // numeric.rs out-of-bounds panic). + Ty::DecimalBytes(_, _) => { + let len = gen_decimal_len(u)?; + encode_long(len as i64, out); + push_twos_complement(u, len, out)?; + } + // The fixed-backed decimal reads exactly `size` bytes (no length prefix), + // so only the byte pattern varies. + Ty::DecimalFixed(_, size, _, _) => { + push_twos_complement(u, usize::try_from(*size).expect("fixed size fits usize"), out)?; + } + Ty::Json => encode_json(u, out)?, + Ty::Uuid => { + let b: [u8; 16] = u.arbitrary()?; + let mut s = String::with_capacity(36); + for (i, byte) in b.iter().enumerate() { + if matches!(i, 4 | 6 | 8 | 10) { + s.push('-'); + } + s.push_str(&format!("{byte:02x}")); + } + encode_long(s.len() as i64, out); + out.extend_from_slice(s.as_bytes()); + } + Ty::Fixed(_, size) => { + for _ in 0..*size { + out.push(u.arbitrary::()?); + } + } + // Symbols are A/B/C (indices 0..=2). Index 3 is an out-of-range index + // the decoder must reject without panicking. + Ty::Enum(_) => encode_long(u.int_in_range(0i64..=3)?, out), + Ty::Record(_, fields) => { + for f in fields { + encode_value(u, f, out)?; + } + } + Ty::Array(item) => { + let n = u.int_in_range(0i64..=3)?; + if n > 0 { + encode_long(n, out); + for _ in 0..n { + encode_value(u, item, out)?; + } + } + encode_long(0, out); // end-of-blocks marker + } + Ty::Map(values) => { + let n = u.int_in_range(0i64..=3)?; + if n > 0 { + encode_long(n, out); + for _ in 0..n { + encode_str(u, out)?; // key + encode_value(u, values, out)?; + } + } + encode_long(0, out); + } + // union ["null", T]: branch 0 = null (no bytes), branch 1 = T. + Ty::Nullable(inner) => { + if u.int_in_range(0u8..=3)? == 0 { + encode_long(0, out); + } else { + encode_long(1, out); + encode_value(u, inner, out)?; + } + } + // Essential union: branches are written in declaration order. With a + // leading `null`, branch 0 = null and branch `i+1` = variant `i`. 
+ // Without, branch `i` = variant `i`. Pick one branch and encode it. + Ty::EssentialUnion(has_null, variants) => { + let nbranches = variants.len() + usize::from(*has_null); + let branch = u.int_in_range(0usize..=nbranches - 1)?; + encode_long(branch as i64, out); + if *has_null { + if branch > 0 { + encode_value(u, &variants[branch - 1], out)?; + } + } else { + encode_value(u, &variants[branch], out)?; + } + } + } + Ok(()) +} + +/// Whether *every* value `encode_value` can emit for `ty` is guaranteed to +/// decode without error, i.e. the type carries no value-level validation the +/// encoder can (deliberately) violate. When this holds for an uncorrupted body, +/// a decode failure is a real bug, not an expected error path, so the caller +/// can assert success instead of falling back to the panic-only oracle. +/// +/// This intentionally EXCLUDES the types the encoder can drive out of their +/// target domain, each of which the decoder is *supposed* to reject: +/// * `enum`: the encoder emits an out-of-range symbol index (3). +/// * `json`: the encoder sometimes emits invalid JSON (the `BadJson` path). +/// * `decimal` (bytes / fixed): an unscaled run can exceed the precision. +/// * `date` / `timestamp-*`: an arbitrary `int` / `long` can fall outside the +/// representable chrono range. +/// * `uuid`: kept out for safety even though the encoder writes canonical +/// hex (the decoder still UTF-8- and `Uuid::parse_str`-validates it). +/// +/// Everything left always round-trips for any value the encoder produces: +/// * plain scalars: `bool` (0/1), `int`/`long` (canonical varints), +/// `float`/`double` (raw IEEE bytes, NaN included), `string` (the encoder +/// writes printable ASCII, which is valid UTF-8), `bytes`, and `fixed` +/// (raw byte runs, no validation). 
+/// * structural composites: records, arrays, maps (the decoder dedups keys +/// via a `BTreeMap`, so even colliding ASCII keys are fine), and the +/// `["null", T]` / essential unions, provided their children are decodable. +fn decode_infallible(ty: &Ty) -> bool { + match ty { + Ty::Bool + | Ty::Int + | Ty::Long + | Ty::Float + | Ty::Double + | Ty::String + | Ty::Bytes + | Ty::Fixed(_, _) => true, + Ty::DecimalBytes(_, _) + | Ty::DecimalFixed(_, _, _, _) + | Ty::Date + | Ty::TimestampMillis + | Ty::TimestampMicros + | Ty::Uuid + | Ty::Json + | Ty::Enum(_) => false, + Ty::Record(_, fields) => fields.iter().all(decode_infallible), + Ty::Array(item) | Ty::Map(item) | Ty::Nullable(item) => decode_infallible(item), + Ty::EssentialUnion(_, variants) => variants.iter().all(decode_infallible), + } +} + +/// A plain-scalar column for the round-trip correctness oracle. Restricted to +/// the types whose decoded `Datum` is exactly determined by the encoded value, +/// with no float NaN, no logical-type conversion, no multi-column union +/// expansion. +#[derive(Clone)] +enum RtVal { + Null, + Bool(bool), + Int(i32), + Long(i64), + Str(String), + Bytes(Vec), +} + +impl RtVal { + /// The `Datum` the Avro decoder must produce for this value (see the + /// `AvroFlatDecoder` scalar handlers: `int`->`Int32`, `long`->`Int64`, + /// `boolean`->`True`/`False`, `string`->`String`, `bytes`->`Bytes`, and a + /// selected-null `["null", T]` branch -> a single `Datum::Null`). + fn datum(&self) -> Datum<'_> { + match self { + RtVal::Null => Datum::Null, + RtVal::Bool(true) => Datum::True, + RtVal::Bool(false) => Datum::False, + RtVal::Int(v) => Datum::Int32(*v), + RtVal::Long(v) => Datum::Int64(*v), + RtVal::Str(s) => Datum::String(s), + RtVal::Bytes(b) => Datum::Bytes(b), + } + } + + /// Avro-binary-encode this value as the body of its column. `nullable` + /// columns are `["null", T]`: branch 0 = null, branch 1 = the value. 
+ fn encode(&self, nullable: bool, out: &mut Vec) { + if nullable { + encode_long(if matches!(self, RtVal::Null) { 0 } else { 1 }, out); + } + match self { + RtVal::Null => {} // null branch already written + RtVal::Bool(b) => out.push(u8::from(*b)), + RtVal::Int(v) => encode_long(i64::from(*v), out), + RtVal::Long(v) => encode_long(*v, out), + RtVal::Str(s) => { + encode_long(s.len() as i64, out); + out.extend_from_slice(s.as_bytes()); + } + RtVal::Bytes(b) => { + encode_long(b.len() as i64, out); + out.extend_from_slice(b); + } + } + } +} + +/// Round-trip *correctness* oracle. The panic / decode-success oracles only see +/// crashes and wrongly-rejected input. A decode that succeeds but yields the +/// *wrong* datum slips past both. So: build a record of plain scalars whose +/// decoded `Datum` is fully determined, encode it, decode it, and assert the +/// decoded `Row` is exactly the values we put in. Restricted to +/// bool/int/long/string/bytes (and their `["null", T]` form), trivial, exact +/// `Datum` mappings with no float NaN, logical-type, or union-column-expansion +/// ambiguity, so any mismatch is a real decoder bug, never a harness artifact. +fn run_roundtrip(u: &mut Unstructured) -> arbitrary::Result<()> { + let ncols = u.int_in_range(1usize..=8)?; + let mut cols: Vec<(RtVal, bool)> = Vec::with_capacity(ncols); + for _ in 0..ncols { + let nullable = u.int_in_range(0u8..=1)? == 1; + // A nullable column is null a quarter of the time. + let val = if nullable && u.int_in_range(0u8..=3)? == 0 { + RtVal::Null + } else { + match u.int_in_range(0u8..=4)? { + 0 => RtVal::Bool(u.int_in_range(0u8..=1)? 
== 1), + 1 => RtVal::Int(u.arbitrary::()?), + 2 => RtVal::Long(u.arbitrary::()?), + 3 => { + let n = u.int_in_range(0usize..=8)?; + let mut s = String::with_capacity(n); + for _ in 0..n { + s.push(char::from(u.int_in_range(0x20u8..=0x7e)?)); + } + RtVal::Str(s) + } + _ => { + let n = u.int_in_range(0usize..=8)?; + let mut b = Vec::with_capacity(n); + for _ in 0..n { + b.push(u.arbitrary::()?); + } + RtVal::Bytes(b) + } + } + }; + cols.push((val, nullable)); + } + + // Reader schema: a record of the chosen columns. + let mut schema = String::from("{\"type\":\"record\",\"name\":\"RT\",\"fields\":["); + for (i, (val, nullable)) in cols.iter().enumerate() { + if i > 0 { + schema.push(','); + } + let t = match val { + // For a null cell the column type only needs to be *some* scalar. + // The decoded datum is `Null` regardless. Use `int` as a stand-in. + RtVal::Null | RtVal::Int(_) => "\"int\"", + RtVal::Bool(_) => "\"boolean\"", + RtVal::Long(_) => "\"long\"", + RtVal::Str(_) => "\"string\"", + RtVal::Bytes(_) => "\"bytes\"", + }; + if *nullable { + schema.push_str(&format!("{{\"name\":\"c{i}\",\"type\":[\"null\",{t}]}}")); + } else { + schema.push_str(&format!("{{\"name\":\"c{i}\",\"type\":{t}}}")); + } + } + schema.push_str("]}"); + + let Ok(mut decoder) = Decoder::new(&schema, &[], WriterSchemaProvider::None, "fuzz".into()) else { + // A record of plain scalars always validates. If not, nothing to check. 
+ return Ok(()); + }; + + let mut body = Vec::new(); + for (val, nullable) in &cols { + val.encode(*nullable, &mut body); + } + + let mut expected = Row::default(); + { + let mut packer = expected.packer(); + for (val, _) in &cols { + packer.push(val.datum()); + } + } + + let mut bytes = body.as_slice(); + match rt().block_on(decoder.decode(&mut bytes)) { + Ok(Ok(decoded)) => assert_eq!( + decoded, expected, + "Avro decode produced the wrong row for a plain-scalar record\nschema = {schema}", + ), + Ok(Err(e)) => panic!("plain-scalar record failed to decode (schema = {schema}): {e}"), + Err(e) => panic!("plain-scalar record hit a transient decode error (schema = {schema}): {e}"), + } + Ok(()) +} + +fn run(mut u: Unstructured) -> arbitrary::Result<()> { + // A fraction of inputs go to the round-trip correctness oracle (decoded row + // must equal the encoded values). The rest drive the schema-directed + // panic / decode-success target below. + if u.int_in_range(0u8..=3)? == 0 { + return run_roundtrip(&mut u); + } + + // Top-level reader schema: a record whose fields span everything + // `validate_schema_2` accepts. + let mut counter = 0u32; + let nfields = u.int_in_range(1u8..=8)?; + let mut fields = Vec::with_capacity(nfields.into()); + for _ in 0..nfields { + // Record fields, so essential unions (`gen_field`) are reachable here. + fields.push(gen_field(&mut u, &mut counter, 3)?); + } + let row = Ty::Record(0, fields); + + let mut schema = String::new(); + ty_to_json(&row, &mut schema); + + // No CSR client and confluent_wire_format = false, so decode is + // self-contained (no network) over the generated reader schema. + let Ok(mut decoder) = Decoder::new(&schema, &[], WriterSchemaProvider::None, "fuzz".into()) else { + return Ok(()); + }; + + // Body: usually a valid encoding (so the decoder runs deep), but a quarter + // of the time the raw remaining bytes, and otherwise a valid encoding + // occasionally truncated or single-byte corrupted. 
The non-valid forms keep + // the decoder's error paths covered: short read, bad length/union tag, + // inconsistent content. + // `assert_success` is set only when the body is a *clean* (uncorrupted) + // encoding of a type tree every node of which is guaranteed to decode for + // any value the encoder can produce (see `decode_infallible`). In that one + // case the decoder is round-tripping bytes it must accept, so a decode + // *error* is a real bug. The panic-only oracle that the random-bytes, + // truncated, corrupted, and value-validated (`enum`/`decimal`/`json`/…) + // arms rely on would silently miss it, the same class of regression as the + // deferred union-promotion error in #37087. + let mut assert_success = false; + let body = if u.int_in_range(0u8..=3)? == 0 { + u.take_rest().to_vec() + } else { + let mut b = Vec::new(); + encode_value(&mut u, &row, &mut b)?; + match u.int_in_range(0u8..=9)? { + 0 if !b.is_empty() => { + let keep = u.int_in_range(0usize..=b.len())?; + b.truncate(keep); + } + 1 if !b.is_empty() => { + let i = u.int_in_range(0usize..=b.len() - 1)?; + b[i] ^= u.arbitrary::()?; + } + // Uncorrupted body: if its type is guaranteed decodable, the decode + // below must succeed. + _ => assert_success = decode_infallible(&row), + } + b + }; + + let mut bytes = body.as_slice(); + let result = rt().block_on(decoder.decode(&mut bytes)); + if assert_success { + let _ = result.expect( + "decoder rejected a clean, in-range Avro body whose every field is guaranteed \ + decodable; such a round-trip must succeed", + ); + } + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let _ = run(Unstructured::new(data)); +}); diff --git a/src/interchange/fuzz/fuzz_targets/json_encode.rs b/src/interchange/fuzz/fuzz_targets/json_encode.rs new file mode 100644 index 0000000000000..f660328db5424 --- /dev/null +++ b/src/interchange/fuzz/fuzz_targets/json_encode.rs @@ -0,0 +1,430 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. 
+// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `mz_interchange::json::encode_datums_as_json` formats a `Row`'s +//! datums as JSON, the output path of a `FORMAT JSON` Kafka sink. (The JSON +//! *source* decoder is just `Jsonb::from_slice`, already covered by +//! `repr::jsonb_from_slice`.) Each datum is rendered per its column type, which +//! has real per-type logic: floats (a JSON `Number` can't be NaN/Infinity), +//! numerics, intervals, timestamps, dates, bytes. A panic there, or a value +//! that can't be serialized, corrupts/halts a sink, so encoding then serializing +//! must never panic for any well-typed row. +//! +//! Beyond scalars, we generate *composite* column types (`List`, `Map`, +//! `Record`, multi-dimensional `Array`, and `Jsonb`) plus the remaining scalar +//! shapes with their own encode logic (`Uuid`, `Char`/`VarChar` padding, +//! `Range`, `MzAclItem`/`AclItem`). These drive the recursive encode paths that +//! a scalar-only schema never reaches: `encode_array`'s dimension walk, the +//! `Record`/`Map` field iteration (a `zip_eq` against the column type), and the +//! `Jsonb -> serde_json` conversion. A `Row` ultimately comes from a source, so +//! the *values* are arbitrary (NaN/Infinity floats, huge collections, deeply +//! nested records), but they must be *well-typed* against the schema: we +//! generate the type and a structurally matching datum in lockstep, so any panic +//! is in the encoder, not a type/datum mismatch. 
+ +#![no_main] + +use chrono::{DateTime, NaiveTime, Utc}; +use libfuzzer_sys::arbitrary::{self, Arbitrary, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_interchange::json::encode_datums_as_json; +use mz_repr::adt::char::CharLength; +use mz_repr::adt::date::Date; +use mz_repr::adt::interval::Interval; +use mz_repr::adt::jsonb::JsonbPacker; +use mz_repr::adt::mz_acl_item::{AclItem, MzAclItem}; +use mz_repr::adt::range::Range; +use mz_repr::adt::system::Oid; +use mz_repr::adt::timestamp::CheckedTimestamp; +use mz_repr::adt::varchar::VarCharMaxLength; +use mz_repr::role_id::RoleId; +use mz_repr::strconv::parse_numeric; +use mz_repr::{ColumnName, Datum, Row, RowPacker, SqlColumnType, SqlScalarType, Timestamp}; +use uuid::Uuid; + +/// A generated column type. Scalars are the leaves. The composite variants nest +/// (with a depth bound) to drive the recursive encode paths. We keep the type +/// structured so that `push_typed` can pack a value that matches it exactly. +enum GType { + Scalar(SqlScalarType), + /// SQL `Jsonb`, packed via `JsonbPacker`, rendered by `to_serde_json`. + Jsonb, + /// `LIST` of `(nullable element, element type)`. + List(bool, Box), + /// `MAP` of string -> `(nullable value, value type)`. + Map(bool, Box), + /// `RECORD` of `(field nullable, field type)` columns. Encoding `zip_eq`s + /// the field types against the packed list, so they must line up. + Record(Vec<(bool, GType)>), + /// Multi-dimensional `ARRAY` of a scalar element. `dims` are the per-axis + /// lengths. The total element count is their product. 
+ Array(Vec, SqlScalarType), +} + +fn gen_naive_ts( + u: &mut Unstructured, +) -> arbitrary::Result> { + let secs = u.int_in_range(-8_000_000_000_000i64..=8_000_000_000_000)?; + let nanos = u.int_in_range(0u32..=999_999_999)?; + Ok(DateTime::from_timestamp(secs, nanos) + .and_then(|d| CheckedTimestamp::from_timestamplike(d.naive_utc()).ok()) + .unwrap_or_else(|| { + CheckedTimestamp::from_timestamplike( + DateTime::from_timestamp(0, 0).unwrap().naive_utc(), + ) + .unwrap() + })) +} + +fn gen_utc_ts(u: &mut Unstructured) -> arbitrary::Result>> { + let secs = u.int_in_range(-8_000_000_000_000i64..=8_000_000_000_000)?; + let nanos = u.int_in_range(0u32..=999_999_999)?; + Ok(DateTime::from_timestamp(secs, nanos) + .and_then(|d| CheckedTimestamp::from_timestamplike(d).ok()) + .unwrap_or_else(|| { + CheckedTimestamp::from_timestamplike(DateTime::from_timestamp(0, 0).unwrap()).unwrap() + })) +} + +/// Generate a scalar type (a leaf of the composite tree, and the element type of +/// arrays). Every type here is handled by `push_scalar`. +fn gen_scalar_type(u: &mut Unstructured) -> arbitrary::Result { + Ok(match u.int_in_range(0u8..=24)? 
{ + 0 => SqlScalarType::Bool, + 1 => SqlScalarType::Int16, + 2 => SqlScalarType::Int32, + 3 => SqlScalarType::Int64, + 4 => SqlScalarType::UInt16, + 5 => SqlScalarType::UInt32, + 6 => SqlScalarType::UInt64, + 7 => SqlScalarType::PgLegacyChar, + 8 => SqlScalarType::Float32, + 9 => SqlScalarType::Float64, + 10 => SqlScalarType::String, + 11 => SqlScalarType::Bytes, + 12 => SqlScalarType::Numeric { max_scale: None }, + 13 => SqlScalarType::Date, + 14 => SqlScalarType::Time, + 15 => SqlScalarType::Timestamp { precision: None }, + 16 => SqlScalarType::TimestampTz { precision: None }, + 17 => SqlScalarType::Interval, + 18 => SqlScalarType::MzTimestamp, + 19 => SqlScalarType::Uuid, + 20 => { + let length = CharLength::try_from(i64::from(u.int_in_range(1u8..=12)?)).ok(); + SqlScalarType::Char { length } + } + 21 => { + let max_length = VarCharMaxLength::try_from(i64::from(u.int_in_range(1u8..=12)?)).ok(); + SqlScalarType::VarChar { max_length } + } + // Range over Int32. `unwrap_range().to_string()` is the only logic. + 22 => SqlScalarType::Range { + element_type: Box::new(SqlScalarType::Int32), + }, + 23 => SqlScalarType::MzAclItem, + _ => SqlScalarType::AclItem, + }) +} + +/// Generate a (possibly composite) column type, bounded by `depth`. +fn gen_type(u: &mut Unstructured, depth: u32) -> arbitrary::Result { + if depth == 0 || u.is_empty() { + return Ok(match u.int_in_range(0u8..=1)? { + 0 => GType::Jsonb, + _ => GType::Scalar(gen_scalar_type(u)?), + }); + } + Ok(match u.int_in_range(0u8..=6)? 
{ + 0 | 1 => GType::Scalar(gen_scalar_type(u)?), + 2 => GType::Jsonb, + 3 => GType::List(bool::arbitrary(u)?, Box::new(gen_type(u, depth - 1)?)), + 4 => GType::Map(bool::arbitrary(u)?, Box::new(gen_type(u, depth - 1)?)), + 5 => { + let n = u.int_in_range(0u8..=4)?; + let mut fields = Vec::with_capacity(n.into()); + for _ in 0..n { + fields.push((bool::arbitrary(u)?, gen_type(u, depth - 1)?)); + } + GType::Record(fields) + } + // Arrays nest only scalars (Materialize arrays don't hold collections), + // but can have multiple dimensions. + _ => { + let ndims = u.int_in_range(1usize..=3)?; + let mut dims = Vec::with_capacity(ndims); + for _ in 0..ndims { + dims.push(u.int_in_range(0usize..=3)?); + } + GType::Array(dims, gen_scalar_type(u)?) + } + }) +} + +/// The `SqlScalarType` corresponding to a `GType`, for the column schema. +fn to_scalar_type(ty: >ype) -> SqlScalarType { + match ty { + GType::Scalar(s) => s.clone(), + GType::Jsonb => SqlScalarType::Jsonb, + GType::List(_, elem) => SqlScalarType::List { + element_type: Box::new(to_scalar_type(elem)), + custom_id: None, + }, + GType::Map(_, value) => SqlScalarType::Map { + value_type: Box::new(to_scalar_type(value)), + custom_id: None, + }, + GType::Record(fields) => SqlScalarType::Record { + fields: fields + .iter() + .enumerate() + .map(|(i, (nullable, ty))| { + ( + ColumnName::from(format!("f{i}")), + SqlColumnType { + scalar_type: to_scalar_type(ty), + nullable: *nullable, + }, + ) + }) + .collect(), + custom_id: None, + }, + GType::Array(_, elem) => SqlScalarType::Array(Box::new(elem.clone())), + } +} + +/// Build a small, valid JSON string of bounded depth for the `Jsonb` path. +fn gen_json_text(u: &mut Unstructured) -> arbitrary::Result { + Ok(match u.int_in_range(0u8..=7)? 
{ + 0 => "null".into(), + 1 => "true".into(), + 2 => format!("{}", u.arbitrary::()?), + 3 => format!("{}.25", u.arbitrary::()?), + 4 => "\"s\"".into(), + 5 => "[1,2,3]".into(), + 6 => "{\"a\":1,\"b\":[true,null]}".into(), + _ => "[]".into(), + }) +} + +/// Push one scalar datum of `ty` (or NULL when allowed). Out-of-range +/// constructed values fall back to a valid datum so the row always matches the +/// column type. +fn push_scalar( + packer: &mut RowPacker, + u: &mut Unstructured, + ty: &SqlScalarType, + nullable: bool, +) -> arbitrary::Result<()> { + if nullable && u.ratio(1u8, 8u8)? { + packer.push(Datum::Null); + return Ok(()); + } + match ty { + SqlScalarType::Bool => packer.push(if bool::arbitrary(u)? { + Datum::True + } else { + Datum::False + }), + SqlScalarType::Int16 => packer.push(Datum::Int16(i16::arbitrary(u)?)), + SqlScalarType::Int32 => packer.push(Datum::Int32(i32::arbitrary(u)?)), + SqlScalarType::Int64 => packer.push(Datum::Int64(i64::arbitrary(u)?)), + SqlScalarType::UInt16 => packer.push(Datum::UInt16(u16::arbitrary(u)?)), + SqlScalarType::UInt32 => packer.push(Datum::UInt32(u32::arbitrary(u)?)), + SqlScalarType::UInt64 => packer.push(Datum::UInt64(u64::arbitrary(u)?)), + SqlScalarType::PgLegacyChar => packer.push(Datum::UInt8(u8::arbitrary(u)?)), + // `f{32,64}::arbitrary` produces NaN/Infinity, which a JSON Number cannot + // represent, exactly the encoding edge we want to exercise. + SqlScalarType::Float32 => packer.push(Datum::Float32(f32::arbitrary(u)?.into())), + SqlScalarType::Float64 => packer.push(Datum::Float64(f64::arbitrary(u)?.into())), + SqlScalarType::String + | SqlScalarType::VarChar { .. } + | SqlScalarType::Char { .. } => packer.push(Datum::String(<&str>::arbitrary(u)?)), + SqlScalarType::Bytes => packer.push(Datum::Bytes(<&[u8]>::arbitrary(u)?)), + SqlScalarType::Numeric { .. 
} => { + let s = format!("{}.{}", i64::arbitrary(u)?, u32::arbitrary(u)?); + let n = parse_numeric(&s).unwrap_or_else(|_| parse_numeric("0").unwrap()); + packer.push(Datum::Numeric(n)); + } + SqlScalarType::Date => { + let d = Date::from_pg_epoch(i32::arbitrary(u)?) + .unwrap_or_else(|_| Date::from_pg_epoch(0).unwrap()); + packer.push(Datum::Date(d)); + } + SqlScalarType::Time => { + let secs = u.int_in_range(0u32..=86_399)?; + let nanos = u.int_in_range(0u32..=999_999_999)?; + let t = NaiveTime::from_num_seconds_from_midnight_opt(secs, nanos).unwrap(); + packer.push(Datum::Time(t)); + } + SqlScalarType::Timestamp { .. } => packer.push(Datum::Timestamp(gen_naive_ts(u)?)), + SqlScalarType::TimestampTz { .. } => packer.push(Datum::TimestampTz(gen_utc_ts(u)?)), + SqlScalarType::Interval => packer.push(Datum::Interval(Interval::new( + i32::arbitrary(u)?, + i32::arbitrary(u)?, + i64::arbitrary(u)?, + ))), + SqlScalarType::MzTimestamp => { + packer.push(Datum::MzTimestamp(Timestamp::from(u64::arbitrary(u)?))) + } + SqlScalarType::Uuid => packer.push(Datum::Uuid(Uuid::from_u128(u.arbitrary::()?))), + // The empty range is always valid and its `to_string` is exercised. + SqlScalarType::Range { .. } => packer.push(Datum::Range(Range { inner: None })), + SqlScalarType::MzAclItem => { + let item = MzAclItem::empty(RoleId::User(u.arbitrary::()?), RoleId::Public); + packer.push(Datum::MzAclItem(item)); + } + SqlScalarType::AclItem => { + let item = AclItem::empty(Oid(u.arbitrary::()?), Oid(u.arbitrary::()?)); + packer.push(Datum::AclItem(item)); + } + // `gen_scalar_type` never produces other types. + _ => packer.push(Datum::Null), + } + Ok(()) +} + +/// Push a value of `ty` (or NULL when allowed) into `packer`, recursing into +/// composites so the packed structure matches `to_scalar_type(ty)` exactly. 
+fn push_typed( + packer: &mut RowPacker, + u: &mut Unstructured, + ty: >ype, + nullable: bool, +) -> arbitrary::Result<()> { + match ty { + GType::Scalar(s) => push_scalar(packer, u, s, nullable)?, + GType::Jsonb => { + if nullable && u.ratio(1u8, 8u8)? { + packer.push(Datum::Null); + } else { + let text = gen_json_text(u)?; + // The text is always valid JSON by construction. + JsonbPacker::new(packer) + .pack_str(&text) + .expect("generated valid json"); + } + } + GType::List(elem_nullable, elem) => { + if nullable && u.ratio(1u8, 8u8)? { + packer.push(Datum::Null); + } else { + let n = u.int_in_range(0usize..=4)?; + packer.push_list_with(|packer| { + for _ in 0..n { + push_typed(packer, u, elem, *elem_nullable)?; + } + Ok::<_, arbitrary::Error>(()) + })?; + } + } + GType::Map(value_nullable, value) => { + if nullable && u.ratio(1u8, 8u8)? { + packer.push(Datum::Null); + } else { + let n = u.int_in_range(0usize..=4)?; + packer.push_dict_with(|packer| { + // Keys must be pushed in ascending order and be unique. The + // zero-padded counter is lexically ascending. + for i in 0..n { + packer.push(Datum::String(&format!("k{i:03}"))); + push_typed(packer, u, value, *value_nullable)?; + } + Ok::<_, arbitrary::Error>(()) + })?; + } + } + GType::Record(fields) => { + if nullable && u.ratio(1u8, 8u8)? { + packer.push(Datum::Null); + } else { + packer.push_list_with(|packer| { + for (field_nullable, field_ty) in fields { + push_typed(packer, u, field_ty, *field_nullable)?; + } + Ok::<_, arbitrary::Error>(()) + })?; + } + } + GType::Array(dims, elem) => { + if nullable && u.ratio(1u8, 8u8)? { + packer.push(Datum::Null); + return Ok(()); + } + // `try_push_array` requires exactly `prod(dims)` elements. A + // zero-length axis collapses the whole array to empty. 
+ let nelements: usize = dims.iter().product(); + let array_dims: Vec = dims + .iter() + .map(|&length| mz_repr::adt::array::ArrayDimension { + lower_bound: 1, + length, + }) + .collect(); + // Build the elements into a scratch row first (array elements are + // always nullable in the encoder). + let mut scratch = Row::default(); + { + let mut sp = scratch.packer(); + for _ in 0..nelements { + push_scalar(&mut sp, u, elem, true)?; + } + } + let elems: Vec = scratch.iter().collect(); + // The dims and element count match by construction, so this can't + // return an error. If it somehow does, fall back to an empty array. + if packer.try_push_array(&array_dims, elems).is_err() { + packer.try_push_array(&[], std::iter::empty::()).unwrap(); + } + } + } + Ok(()) +} + +fn run(u: &mut Unstructured) -> arbitrary::Result<()> { + let ncols = u.int_in_range(1usize..=6)?; + let mut types: Vec<(bool, GType)> = Vec::with_capacity(ncols); + for _ in 0..ncols { + types.push((bool::arbitrary(u)?, gen_type(u, 3)?)); + } + + let columns: Vec<(ColumnName, SqlColumnType)> = types + .iter() + .enumerate() + .map(|(i, (nullable, ty))| { + ( + ColumnName::from(format!("c{i}")), + SqlColumnType { + scalar_type: to_scalar_type(ty), + nullable: *nullable, + }, + ) + }) + .collect(); + + let mut row = Row::default(); + { + let mut packer = row.packer(); + for (nullable, ty) in &types { + push_typed(&mut packer, u, ty, *nullable)?; + } + } + + // Encode the row's datums as JSON (the sink path) and serialize the result. + // Neither step may panic for any well-typed row. 
+ let value = encode_datums_as_json(row.iter(), &columns); + let _ = serde_json::to_vec(&value); + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let mut u = Unstructured::new(data); + let _ = run(&mut u); +}); diff --git a/src/interchange/fuzz/fuzz_targets/protobuf_decode_fuzzed_schema.rs b/src/interchange/fuzz/fuzz_targets/protobuf_decode_fuzzed_schema.rs new file mode 100644 index 0000000000000..c2cf3c6a0e91b --- /dev/null +++ b/src/interchange/fuzz/fuzz_targets/protobuf_decode_fuzzed_schema.rs @@ -0,0 +1,573 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `mz_interchange::protobuf` over a *fuzzer-generated* descriptor +//! rather than a fixed one. A `FORMAT PROTOBUF` source carries a user-supplied +//! `FileDescriptorSet` (the `USING SCHEMA` blob), so `DecodedDescriptors::from_bytes` +//! and the hand-written `derive_inner_type` validator it runs are themselves an +//! untrusted-input surface, and a fixed descriptor never exercises them with +//! anything but one well-formed schema. The interesting paths are the ones a +//! single descriptor can't reach: recursion detection (a message field +//! referencing its own or a mutually-referencing message must be rejected, not +//! stack-overflow), the full scalar `Kind` mapping, enums, and repeated fields. +//! +//! Critically, we don't feed *random bytes* as the body: prost reads a field +//! tag then skips/short-circuits on most random bytes, so a random body rarely +//! reaches the decode-to-`Row` conversion (scalar `Kind` mapping, enum -> string, +//! repeated -> list, nested message -> record). Instead we keep the schema +//! 
structured (`MsgDef`/`FieldTy`), build the `FileDescriptorSet` from it, and +//! protobuf-binary-encode a valid body for the root message against that same +//! structure, bounding recursion through cyclic message refs by omitting +//! message-typed fields at the depth limit. Both wire formats are exercised: the +//! Confluent variant gets the 5-byte schema-registry header prepended so its +//! body decodes just as deeply. We keep the error-path coverage a random body +//! gave, though: a quarter of the inputs feed the raw remaining bytes, and +//! others truncate or single-byte-corrupt the valid encoding. Neither +//! validation nor decoding may panic. +//! +//! Beyond the well-formed-schema happy path, we drive the descriptor-validation +//! surface harder. Some schemas grow a `map<string, string>` field (a repeated +//! field of a `map_entry`-tagged message): `derive_column_type` is supposed to +//! *reject* maps (`field.is_map()`), so this exercises that rejection cleanly +//! rather than only ever feeding it map-free input. Field numbers are +//! fuzzer-chosen in a small range, so gaps, out-of-order, and duplicate numbers +//! all occur. Duplicates are an invalid descriptor `DescriptorPool::decode` +//! must reject without panicking. Packable repeated scalars are sometimes +//! emitted in proto3's packed wire form (one length-delimited blob of +//! back-to-back values) instead of tag-per-element, covering the packed decode +//! path. Finally the *root* message the decoder is pointed at is fuzzer-chosen +//! among the generated messages, not pinned to `M0`. 
+ +#![no_main] + +use libfuzzer_sys::arbitrary::{self, Arbitrary, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_interchange::protobuf::{DecodedDescriptors, Decoder}; +use prost::Message; +use prost_types::field_descriptor_proto::{Label, Type}; +use prost_types::{ + DescriptorProto, EnumDescriptorProto, EnumValueDescriptorProto, FieldDescriptorProto, + FileDescriptorProto, FileDescriptorSet, MessageOptions, +}; + +/// Name of the synthetic `map_entry` message that backs generated `map` fields. +const MAP_ENTRY: &str = "MapEntry"; + +/// A generated field type. Kept structured (rather than going straight to a +/// `FieldDescriptorProto`) so the body encoder can walk the exact same shape. +/// A repeated field is one descriptor entry but N encoded values, and a message +/// field needs the referenced message's structure to encode a nested value. +enum FieldTy { + Int32, + Int64, + Uint32, + Uint64, + Sint32, + Sint64, + Fixed32, + Fixed64, + Sfixed32, + Sfixed64, + Bool, + StringT, + Bytes, + Float, + Double, + /// Reference to enum `E{0}`. + Enum(u8), + /// Reference to message `M{0}`, may equal the containing message (a cycle). + Message(u8), + /// `map`, a repeated reference to the `map_entry`-tagged + /// `MapEntry` message. `derive_column_type` rejects this, so the descriptor + /// it appears in fails validation (exercising the map-rejection path). + Map, +} + +struct FieldDef { + number: i32, + ty: FieldTy, + repeated: bool, +} + +struct MsgDef { + fields: Vec, +} + +/// Generate the message/enum structure. `enum_nvals[j]` is the number of values +/// in enum `E{j}`. 
+fn gen_schema(u: &mut Unstructured) -> arbitrary::Result<(Vec, Vec)> { + let num_msgs = u.int_in_range(1u8..=4)?; + let num_enums = u.int_in_range(0u8..=2)?; + + let mut enum_nvals = Vec::with_capacity(num_enums.into()); + for _ in 0..num_enums { + enum_nvals.push(u.int_in_range(1u8..=3)?); + } + + let mut msgs = Vec::with_capacity(num_msgs.into()); + for _ in 0..num_msgs { + let nfields = u.int_in_range(0u8..=6)?; + let mut fields = Vec::with_capacity(nfields.into()); + for fi in 0..nfields { + let repeated = bool::arbitrary(u)?; + let ty = match u.int_in_range(0u8..=17)? { + 0 => FieldTy::Int32, + 1 => FieldTy::Int64, + 2 => FieldTy::Uint32, + 3 => FieldTy::Uint64, + 4 => FieldTy::Sint32, + 5 => FieldTy::Sint64, + 6 => FieldTy::Fixed32, + 7 => FieldTy::Fixed64, + 8 => FieldTy::Sfixed32, + 9 => FieldTy::Sfixed64, + 10 => FieldTy::Bool, + 11 => FieldTy::StringT, + 12 => FieldTy::Bytes, + 13 => FieldTy::Float, + 14 => FieldTy::Double, + 15 if num_enums > 0 => FieldTy::Enum(u.int_in_range(0u8..=num_enums - 1)?), + 16 => FieldTy::Map, + _ => FieldTy::Message(u.int_in_range(0u8..=num_msgs - 1)?), + }; + // Field numbers are usually the sequential `fi + 1`, but sometimes a + // fuzzer-chosen small number, producing gaps, reordering, and + // duplicate numbers (the last being an invalid descriptor that + // `DescriptorPool::decode` must reject without panicking). + let number = if u.int_in_range(0u8..=3)? == 0 { + i32::from(u.int_in_range(1u8..=8)?) 
+ } else { + i32::from(fi) + 1 + }; + fields.push(FieldDef { + number, + ty, + repeated, + }); + } + msgs.push(MsgDef { fields }); + } + + Ok((msgs, enum_nvals)) +} + +fn field_proto(f: &FieldDef) -> FieldDescriptorProto { + let (ty, type_name) = match &f.ty { + FieldTy::Int32 => (Type::Int32, None), + FieldTy::Int64 => (Type::Int64, None), + FieldTy::Uint32 => (Type::Uint32, None), + FieldTy::Uint64 => (Type::Uint64, None), + FieldTy::Sint32 => (Type::Sint32, None), + FieldTy::Sint64 => (Type::Sint64, None), + FieldTy::Fixed32 => (Type::Fixed32, None), + FieldTy::Fixed64 => (Type::Fixed64, None), + FieldTy::Sfixed32 => (Type::Sfixed32, None), + FieldTy::Sfixed64 => (Type::Sfixed64, None), + FieldTy::Bool => (Type::Bool, None), + FieldTy::StringT => (Type::String, None), + FieldTy::Bytes => (Type::Bytes, None), + FieldTy::Float => (Type::Float, None), + FieldTy::Double => (Type::Double, None), + FieldTy::Enum(j) => (Type::Enum, Some(format!(".fuzz.E{j}"))), + FieldTy::Message(r) => (Type::Message, Some(format!(".fuzz.M{r}"))), + // A map field is a repeated reference to the map_entry message. + FieldTy::Map => (Type::Message, Some(format!(".fuzz.{MAP_ENTRY}"))), + }; + // Map fields must be repeated for `is_map()` to fire. + let label = if f.repeated || matches!(f.ty, FieldTy::Map) { + Label::Repeated + } else { + Label::Optional + }; + FieldDescriptorProto { + name: Some(format!("f{}", f.number - 1)), + number: Some(f.number), + label: Some(label as i32), + r#type: Some(ty as i32), + type_name, + ..Default::default() + } +} + +/// Build the `FileDescriptorSet` blob (`USING SCHEMA`) from the structure. +fn build_fds(msgs: &[MsgDef], enum_nvals: &[u8]) -> Vec { + let enums = enum_nvals + .iter() + .enumerate() + .map(|(j, &nvals)| EnumDescriptorProto { + name: Some(format!("E{j}")), + // proto3 requires the first enum value to be zero. 
+ value: (0..nvals) + .map(|v| EnumValueDescriptorProto { + name: Some(format!("E{j}_{v}")), + number: Some(i32::from(v)), + ..Default::default() + }) + .collect(), + ..Default::default() + }) + .collect(); + + let mut messages: Vec = msgs + .iter() + .enumerate() + .map(|(mi, m)| DescriptorProto { + name: Some(format!("M{mi}")), + field: m.fields.iter().map(field_proto).collect(), + ..Default::default() + }) + .collect(); + + // The synthetic `map_entry` message backing `map` fields: + // key = 1, value = 2, with `MessageOptions.map_entry = true` so prost-reflect + // reports `is_map_entry()` (and the referencing field reports `is_map()`). + messages.push(DescriptorProto { + name: Some(MAP_ENTRY.to_owned()), + field: vec![ + FieldDescriptorProto { + name: Some("key".to_owned()), + number: Some(1), + label: Some(Label::Optional as i32), + r#type: Some(Type::String as i32), + ..Default::default() + }, + FieldDescriptorProto { + name: Some("value".to_owned()), + number: Some(2), + label: Some(Label::Optional as i32), + r#type: Some(Type::String as i32), + ..Default::default() + }, + ], + options: Some(MessageOptions { + map_entry: Some(true), + ..Default::default() + }), + ..Default::default() + }); + + let file = FileDescriptorProto { + name: Some("fuzz.proto".to_owned()), + package: Some("fuzz".to_owned()), + message_type: messages, + enum_type: enums, + syntax: Some("proto3".to_owned()), + ..Default::default() + }; + FileDescriptorSet { file: vec![file] }.encode_to_vec() +} + +/// Encode an unsigned protobuf varint. +fn encode_varint(mut v: u64, out: &mut Vec) { + loop { + if v < 0x80 { + out.push(v as u8); + return; + } + out.push((v as u8 & 0x7f) | 0x80); + v >>= 7; + } +} + +/// Encode a field tag: `(field_number << 3) | wire_type`. 
+fn encode_tag(number: i32, wire_type: u8, out: &mut Vec) { + encode_varint(((number as u64) << 3) | u64::from(wire_type), out); +} + +fn encode_len_delimited(number: i32, bytes: &[u8], out: &mut Vec) { + encode_tag(number, 2, out); + encode_varint(bytes.len() as u64, out); + out.extend_from_slice(bytes); +} + +/// Encode one occurrence of a field (tag + value). +fn encode_field( + msgs: &[MsgDef], + enum_nvals: &[u8], + f: &FieldDef, + depth: u32, + u: &mut Unstructured, + out: &mut Vec, +) -> arbitrary::Result<()> { + let n = f.number; + match &f.ty { + // wire type 0: varint. + FieldTy::Int32 => { + encode_tag(n, 0, out); + encode_varint(i64::from(u.arbitrary::()?) as u64, out); + } + FieldTy::Int64 => { + encode_tag(n, 0, out); + encode_varint(u.arbitrary::()? as u64, out); + } + FieldTy::Uint32 => { + encode_tag(n, 0, out); + encode_varint(u64::from(u.arbitrary::()?), out); + } + FieldTy::Uint64 => { + encode_tag(n, 0, out); + encode_varint(u.arbitrary::()?, out); + } + FieldTy::Sint32 => { + encode_tag(n, 0, out); + let v = u.arbitrary::()?; + encode_varint((((v << 1) ^ (v >> 31)) as u32).into(), out); + } + FieldTy::Sint64 => { + encode_tag(n, 0, out); + let v = u.arbitrary::()?; + encode_varint(((v << 1) ^ (v >> 63)) as u64, out); + } + FieldTy::Bool => { + encode_tag(n, 0, out); + encode_varint(u.int_in_range(0u64..=1)?, out); + } + FieldTy::Enum(j) => { + encode_tag(n, 0, out); + // Valid values are 0..nvals. `nvals` itself is one out-of-range + // index (proto3 open enums accept it, the decoder maps it). + let nvals = u64::from(enum_nvals[usize::from(*j)]); + encode_varint(u.int_in_range(0u64..=nvals)?, out); + } + // wire type 1: 64-bit. 
+ FieldTy::Fixed64 => { + encode_tag(n, 1, out); + out.extend_from_slice(&u.arbitrary::()?.to_le_bytes()); + } + FieldTy::Sfixed64 => { + encode_tag(n, 1, out); + out.extend_from_slice(&u.arbitrary::()?.to_le_bytes()); + } + FieldTy::Double => { + encode_tag(n, 1, out); + out.extend_from_slice(&u.arbitrary::()?.to_le_bytes()); + } + // wire type 5: 32-bit. + FieldTy::Fixed32 => { + encode_tag(n, 5, out); + out.extend_from_slice(&u.arbitrary::()?.to_le_bytes()); + } + FieldTy::Sfixed32 => { + encode_tag(n, 5, out); + out.extend_from_slice(&u.arbitrary::()?.to_le_bytes()); + } + FieldTy::Float => { + encode_tag(n, 5, out); + out.extend_from_slice(&u.arbitrary::()?.to_le_bytes()); + } + // wire type 2: length-delimited. + FieldTy::StringT => { + let len = u.int_in_range(0usize..=8)?; + let mut s = Vec::with_capacity(len); + for _ in 0..len { + s.push(u.int_in_range(0x20u8..=0x7e)?); + } + encode_len_delimited(n, &s, out); + } + FieldTy::Bytes => { + let len = u.int_in_range(0usize..=8)?; + let mut b = Vec::with_capacity(len); + for _ in 0..len { + b.push(u.arbitrary::()?); + } + encode_len_delimited(n, &b, out); + } + FieldTy::Message(r) => { + // Bound recursion through cyclic refs: at the depth limit emit an + // empty (all-defaults) nested message. + let mut nested = Vec::new(); + if depth > 0 { + encode_message(msgs, enum_nvals, usize::from(*r), depth - 1, u, &mut nested)?; + } + encode_len_delimited(n, &nested, out); + } + FieldTy::Map => { + // One map entry: a length-delimited message with key=1, value=2 + // (both string). The descriptor with a map field never decodes (it's + // rejected at validation), but a valid body keeps generation honest. + let mut entry = Vec::new(); + let mut k = Vec::new(); + for _ in 0..u.int_in_range(0usize..=4)? { + k.push(u.int_in_range(0x20u8..=0x7e)?); + } + encode_len_delimited(1, &k, &mut entry); + let mut v = Vec::new(); + for _ in 0..u.int_in_range(0usize..=4)? 
{ + v.push(u.int_in_range(0x20u8..=0x7e)?); + } + encode_len_delimited(2, &v, &mut entry); + encode_len_delimited(n, &entry, out); + } + } + Ok(()) +} + +/// Whether a scalar field type can be encoded in proto3's packed wire form +/// (varint / fixed32 / fixed64 scalars). Strings, bytes, messages, and maps +/// cannot be packed. +fn is_packable(ty: &FieldTy) -> bool { + matches!( + ty, + FieldTy::Int32 + | FieldTy::Int64 + | FieldTy::Uint32 + | FieldTy::Uint64 + | FieldTy::Sint32 + | FieldTy::Sint64 + | FieldTy::Bool + | FieldTy::Enum(_) + | FieldTy::Fixed32 + | FieldTy::Fixed64 + | FieldTy::Sfixed32 + | FieldTy::Sfixed64 + | FieldTy::Float + | FieldTy::Double + ) +} + +/// Encode just the value (no tag) of a packable scalar field, the element form +/// inside a packed repeated field's length-delimited blob. +fn encode_scalar_value( + enum_nvals: &[u8], + f: &FieldDef, + u: &mut Unstructured, + out: &mut Vec, +) -> arbitrary::Result<()> { + match &f.ty { + FieldTy::Int32 => encode_varint(i64::from(u.arbitrary::()?) as u64, out), + FieldTy::Int64 => encode_varint(u.arbitrary::()? 
as u64, out), + FieldTy::Uint32 => encode_varint(u64::from(u.arbitrary::()?), out), + FieldTy::Uint64 => encode_varint(u.arbitrary::()?, out), + FieldTy::Sint32 => { + let v = u.arbitrary::()?; + encode_varint((((v << 1) ^ (v >> 31)) as u32).into(), out); + } + FieldTy::Sint64 => { + let v = u.arbitrary::()?; + encode_varint(((v << 1) ^ (v >> 63)) as u64, out); + } + FieldTy::Bool => encode_varint(u.int_in_range(0u64..=1)?, out), + FieldTy::Enum(j) => { + let nvals = u64::from(enum_nvals[usize::from(*j)]); + encode_varint(u.int_in_range(0u64..=nvals)?, out); + } + FieldTy::Fixed64 => out.extend_from_slice(&u.arbitrary::()?.to_le_bytes()), + FieldTy::Sfixed64 => out.extend_from_slice(&u.arbitrary::()?.to_le_bytes()), + FieldTy::Double => out.extend_from_slice(&u.arbitrary::()?.to_le_bytes()), + FieldTy::Fixed32 => out.extend_from_slice(&u.arbitrary::()?.to_le_bytes()), + FieldTy::Sfixed32 => out.extend_from_slice(&u.arbitrary::()?.to_le_bytes()), + FieldTy::Float => out.extend_from_slice(&u.arbitrary::()?.to_le_bytes()), + // Unreachable: callers gate on `is_packable`. + _ => {} + } + Ok(()) +} + +/// Encode a valid body for message `M{idx}`. +fn encode_message( + msgs: &[MsgDef], + enum_nvals: &[u8], + idx: usize, + depth: u32, + u: &mut Unstructured, + out: &mut Vec, +) -> arbitrary::Result<()> { + let Some(msg) = msgs.get(idx) else { + return Ok(()); + }; + for f in &msg.fields { + // Repeated fields get 0..=3 occurrences, singular fields 0..=1 + // (proto3 fields are optional). Message fields at the depth limit are + // omitted entirely to break cycles. + let max = if matches!(f.ty, FieldTy::Message(_)) && depth == 0 { + 0 + } else if f.repeated { + 3 + } else { + 1 + }; + let count = u.int_in_range(0u8..=max)?; + // A repeated packable scalar is sometimes emitted in proto3's packed wire + // form (one length-delimited blob of back-to-back values) instead of a + // tag per element, to exercise the packed decode path. 
(`prost` accepts + // either encoding for a packed field.) + if f.repeated && count > 0 && is_packable(&f.ty) && u.int_in_range(0u8..=1)? == 0 { + let mut packed = Vec::new(); + for _ in 0..count { + encode_scalar_value(enum_nvals, f, u, &mut packed)?; + } + encode_len_delimited(f.number, &packed, out); + } else { + for _ in 0..count { + encode_field(msgs, enum_nvals, f, depth, u, out)?; + } + } + } + Ok(()) +} + +fn run(mut u: Unstructured) -> arbitrary::Result<()> { + let (msgs, enum_nvals) = gen_schema(&mut u)?; + let fds = build_fds(&msgs, &enum_nvals); + + // The decoder's root message is fuzzer-chosen among the generated messages, + // not pinned to M0, so the various field shapes get exercised as the *top* + // message (where columns are derived directly) and not only when nested. + let root = usize::from(u.int_in_range(0u8..=u8::try_from(msgs.len() - 1).unwrap())?); + let root_name = format!("fuzz.M{root}"); + + // Body for the root message: usually a valid encoding (so prost decodes + // through to the Row conversion), but a quarter of the time the raw remaining + // bytes, and otherwise a valid encoding occasionally truncated or single-byte + // corrupted. The non-valid forms keep the decoder's error paths covered: bad + // tag/wire-type, length overrun, unexpected EOF. + let body = if u.int_in_range(0u8..=3)? == 0 { + u.take_rest().to_vec() + } else { + let mut b = Vec::new(); + encode_message(&msgs, &enum_nvals, root, 3, &mut u, &mut b)?; + match u.int_in_range(0u8..=9)? 
{ + 0 if !b.is_empty() => { + let keep = u.int_in_range(0usize..=b.len())?; + b.truncate(keep); + } + 1 if !b.is_empty() => { + let i = u.int_in_range(0usize..=b.len() - 1)?; + b[i] ^= u.arbitrary::()?; + } + _ => {} + } + b + }; + + for confluent_wire_format in [false, true] { + let Ok(descriptors) = DecodedDescriptors::from_bytes(&fds, root_name.clone()) else { + return Ok(()); + }; + let Ok(mut decoder) = Decoder::new(descriptors, confluent_wire_format) else { + return Ok(()); + }; + // The Confluent variant strips a 5-byte schema-registry header (magic + // byte 0 + 4-byte schema id) before the body. Prepend one so its body + // decodes just as deeply as the raw variant. + let payload = if confluent_wire_format { + let mut p = vec![0u8; 5]; + p.extend_from_slice(&body); + p + } else { + body.clone() + }; + let _ = decoder.decode(&payload); + } + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let _ = run(Unstructured::new(data)); +}); diff --git a/src/mysql-util/fuzz/.gitignore b/src/mysql-util/fuzz/.gitignore new file mode 100644 index 0000000000000..ab0eaa1a49031 --- /dev/null +++ b/src/mysql-util/fuzz/.gitignore @@ -0,0 +1,5 @@ +target/ +corpus/ +artifacts/ +coverage/ +Cargo.lock diff --git a/src/mysql-util/fuzz/Cargo.toml b/src/mysql-util/fuzz/Cargo.toml new file mode 100644 index 0000000000000..c03f52c49b947 --- /dev/null +++ b/src/mysql-util/fuzz/Cargo.toml @@ -0,0 +1,32 @@ +# Fuzz crate for mz-mysql-util desc proto round-trip. `MySqlTableDesc` +# describes external-database schemas, so a decoder bug here is reachable +# from a compromised upstream MySQL or from on-disk catalog bytes. +# +# Excluded from the main workspace because libFuzzer requires nightly Rust. 
+ +[package] +workspace = "../../../test/cargo-fuzz" +name = "mz-mysql-util-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +mz-mysql-util = { path = "..", features = ["proptest"] } +mz-proto = { path = "../../proto" } +prost = "0.14.3" +proptest = { version = "1.11.0", default-features = false, features = ["std"] } + +[[bin]] +name = "mysql_table_desc_proto_roundtrip" +path = "fuzz_targets/mysql_table_desc_proto_roundtrip.rs" +test = false +doc = false +bench = false + +# The fuzz crate has its own `[workspace]` so it must duplicate the root's +# `[patch.crates-io]`. Keep in sync with the root `Cargo.toml`. diff --git a/src/mysql-util/fuzz/fuzz_targets/mysql_table_desc_proto_roundtrip.rs b/src/mysql-util/fuzz/fuzz_targets/mysql_table_desc_proto_roundtrip.rs new file mode 100644 index 0000000000000..c11cae3e28352 --- /dev/null +++ b/src/mysql-util/fuzz/fuzz_targets/mysql_table_desc_proto_roundtrip.rs @@ -0,0 +1,163 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `ProtoMySqlTableDesc` <-> `MySqlTableDesc` round-trip. +//! Describes external-database schemas, so a decoder bug here is reachable +//! from a compromised upstream MySQL or on-disk catalog bytes. +//! +//! Input generation is split across three arms keyed off the first input +//! byte so a single byte stream exercises all of them over time: +//! +//! 1. **Valid-value arm.** A 32-byte seed (drawn from the input) drives +//! proptest's `Arbitrary for MySqlTableDesc` to build a *structurally +//! valid, deeply-populated* descriptor. Non-empty columns with real +//! 
`SqlColumnType`s, every `MySqlColumnMeta` variant, and a populated +//! `BTreeSet`. It asserts the canonical +//! `from_proto(into_proto(v)) == v` Rust round-trip, which a +//! random-bytes-only target almost never reaches (random protobuf +//! decodes to near-empty messages). +//! +//! 2. **Duplicate/unsorted-keys arm.** Crafts a `ProtoMySqlTableDesc` +//! whose `keys` field (a repeated proto `Vec`) contains duplicates and +//! out-of-order entries to probe the classic `Vec -> BTreeSet -> Vec` +//! round-trip trap: the *first* decode normalizes (dedups + sorts), so +//! we assert the conversion is a *fixed point* afterwards rather than +//! byte-identical to the crafted input. +//! +//! 3. **Raw-bytes arm.** Decode arbitrary bytes and, if they happen to form a +//! valid descriptor, check the proto round-trip is stable. This guards +//! robustness against the real wire/catalog format. + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use mz_mysql_util::{MySqlKeyDesc, MySqlTableDesc, ProtoMySqlKeyDesc, ProtoMySqlTableDesc}; +use mz_proto::{ProtoType, RustType}; +use proptest::strategy::{Strategy, ValueTree}; +use proptest::test_runner::{Config, RngAlgorithm, TestRng, TestRunner}; +use prost::Message; + +/// Assert that a `MySqlTableDesc` survives a full Rust round-trip through +/// its proto representation unchanged, including a re-encode/decode of the +/// wire bytes. +fn assert_rust_roundtrip(orig: &MySqlTableDesc) { + let proto = orig.into_proto(); + let bytes = proto.encode_to_vec(); + let proto2 = ProtoMySqlTableDesc::decode(bytes.as_slice()) + .expect("re-encode of valid MySqlTableDesc must decode"); + let round: MySqlTableDesc = proto2 + .into_rust() + .expect("re-encoded MySqlTableDesc must convert back to Rust"); + assert_eq!(orig, &round, "MySqlTableDesc changed across proto roundtrip"); +} + +/// Decode `bytes` as a proto, and if it is a valid descriptor, assert the +/// proto round-trip is stable. 
Used by both the crafted and raw-bytes arms, +/// where the *first* decode may normalize a `Vec`-shaped field into a +/// `BTreeSet`, so we only require idempotence from that normalized value on. +fn check_decoded(bytes: &[u8]) { + let Ok(proto) = ProtoMySqlTableDesc::decode(bytes) else { + return; + }; + let orig: MySqlTableDesc = match proto.into_rust() { + Ok(v) => v, + Err(_) => return, + }; + assert_rust_roundtrip(&orig); +} + +/// Build a proto whose `keys` Vec deliberately violates the `BTreeSet` +/// invariants (duplicates and reverse order), seeded from `data` so the +/// fuzzer can vary the contents while keeping the shape pathological. +fn craft_unsorted_dup_keys(data: &[u8]) -> ProtoMySqlTableDesc { + // Derive a couple of distinct key bodies from the seed bytes. + let pick = |i: usize| -> String { + let b = data.get(i).copied().unwrap_or(i as u8); + format!("k{}", b % 5) + }; + let key_a = ProtoMySqlKeyDesc { + name: pick(0), + is_primary: data.first().copied().unwrap_or(0) & 1 == 0, + columns: vec![pick(1), pick(2)], + }; + let key_b = ProtoMySqlKeyDesc { + name: pick(3), + is_primary: data.get(1).copied().unwrap_or(0) & 1 == 0, + columns: vec![pick(4)], + }; + // Emit duplicates and in deliberately non-sorted order. The decoder + // collapses these into a BTreeSet, which is the round-trip trap. + ProtoMySqlTableDesc { + name: "fuzz".to_string(), + schema_name: "fuzz".to_string(), + columns: vec![], + keys: vec![key_b.clone(), key_a.clone(), key_a.clone(), key_b], + } +} + +fuzz_target!(|data: &[u8]| { + // Reserve the first byte as a mode selector and the next 32 bytes as the + // proptest seed. Everything after that feeds the raw-bytes / crafting + // logic so a single input can drive any arm. 
+ let mode = data.first().copied().unwrap_or(0); + let mut seed = [0u8; 32]; + let seed_src = data.get(1..33).unwrap_or(&[]); + seed[..seed_src.len()].copy_from_slice(seed_src); + let rest = data.get(33..).unwrap_or(&[]); + + match mode % 3 { + 0 => { + // Valid-value arm: drive proptest's Arbitrary from the seed. + let mut runner = TestRunner::new_with_rng( + Config::default(), + TestRng::from_seed(RngAlgorithm::ChaCha, &seed), + ); + let value = match ::arbitrary() + .new_tree(&mut runner) + { + Ok(tree) => tree.current(), + Err(_) => return, + }; + assert_rust_roundtrip(&value); + + // The proptest-built value is already normalized (its keys come + // from a BTreeSet). Sanity-check that the BTreeSet semantics hold + // after a wire decode by encoding and re-decoding, then confirm + // re-collecting the keys into a fresh set is idempotent. + let decoded: MySqlTableDesc = value + .into_proto() + .encode_to_vec() + .as_slice() + .pipe(ProtoMySqlTableDesc::decode) + .expect("decode") + .into_rust() + .expect("into_rust"); + let recollected: std::collections::BTreeSet = + decoded.keys.iter().cloned().collect(); + assert_eq!(decoded.keys, recollected, "key set not idempotent"); + } + 1 => { + // Duplicate/unsorted-keys arm. + let proto = craft_unsorted_dup_keys(rest); + check_decoded(proto.encode_to_vec().as_slice()); + } + _ => { + // Raw-bytes arm: decode arbitrary bytes directly. + check_decoded(rest); + } + } +}); + +/// Tiny extension trait so the valid-value arm can read top-to-bottom. 
+trait Pipe: Sized { + fn pipe(self, f: impl FnOnce(Self) -> R) -> R { + f(self) + } +} +impl Pipe for T {} diff --git a/src/persist-client/Cargo.toml b/src/persist-client/Cargo.toml index 7e3c04307dd30..3a8e5d0586d9d 100644 --- a/src/persist-client/Cargo.toml +++ b/src/persist-client/Cargo.toml @@ -81,5 +81,8 @@ tonic-prost-build.workspace = true [features] default = ["mz-build-tools/default"] +# Re-exports internal durable-state types (see `fuzz_exports`) for the +# `src/persist-client/fuzz` cargo-fuzz crate. Not for production use. +fuzzing = [] turmoil = [] foundationdb = ["mz-persist/foundationdb"] diff --git a/src/persist-client/fuzz/.gitignore b/src/persist-client/fuzz/.gitignore new file mode 100644 index 0000000000000..ab0eaa1a49031 --- /dev/null +++ b/src/persist-client/fuzz/.gitignore @@ -0,0 +1,5 @@ +target/ +corpus/ +artifacts/ +coverage/ +Cargo.lock diff --git a/src/persist-client/fuzz/Cargo.toml b/src/persist-client/fuzz/Cargo.toml new file mode 100644 index 0000000000000..d5e32ca3e50b9 --- /dev/null +++ b/src/persist-client/fuzz/Cargo.toml @@ -0,0 +1,37 @@ +# Fuzz crate for mz-persist-client: decode the durable consensus/blob state. +# `ProtoRollup` (a full state snapshot) and `ProtoStateDiff` (an incremental +# update) are decoded on every state load. A corrupted or crafted blob that +# panics the decoder poisons the shard, so decode must never panic and valid +# values must survive a proto re-encode round trip. +# +# Excluded from the main workspace because libFuzzer requires nightly Rust. 
+ +[package] +workspace = "../../../test/cargo-fuzz" +name = "mz-persist-client-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +mz-persist-client = { path = "..", features = ["fuzzing"] } +mz-proto = { path = "../../proto" } +prost = "0.14.3" + +[[bin]] +name = "state_diff_proto_roundtrip" +path = "fuzz_targets/state_diff_proto_roundtrip.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "rollup_proto_roundtrip" +path = "fuzz_targets/rollup_proto_roundtrip.rs" +test = false +doc = false +bench = false diff --git a/src/persist-client/fuzz/fuzz_targets/rollup_proto_roundtrip.rs b/src/persist-client/fuzz/fuzz_targets/rollup_proto_roundtrip.rs new file mode 100644 index 0000000000000..2eb1b38d52892 --- /dev/null +++ b/src/persist-client/fuzz/fuzz_targets/rollup_proto_roundtrip.rs @@ -0,0 +1,339 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: exercises `ProtoRollup` decoding and the `Rollup` +//! `from_proto` conversion. A rollup is a full state snapshot read from blob on +//! load, so a decoder panic on a corrupted/crafted blob makes the shard +//! unrecoverable. +//! +//! Decoding *random* bytes as a protobuf almost never yields a `ProtoRollup` +//! that survives `into_rust`: the conversion needs a well-formed shard id +//! (`s` + UUID), a present `trace`, and, most interestingly, inlined `diffs` +//! whose `lower`/`upper` bounds line up with the latest rollup's seqno and the +//! state's seqno. Random input plateaus long before reaching those cross-field +//! invariant checks, so this target hand-builds a *structurally valid* +//! 
`ProtoRollup` on the protobuf wire from fuzzer-chosen parameters. The first +//! byte selects a mode: +//! +//! * mode 0: feed the remaining bytes straight to `ProtoRollup::decode` +//! (the robustness arm: must never panic, and any value that +//! converts must survive a proto re-encode round trip losslessly). +//! * mode 1: synthesize a valid rollup *with* inlined diffs whose bounds +//! satisfy the invariants, decode it (the happy path that the invariant +//! checks must accept), and round-trip it. +//! * mode 2: synthesize a valid rollup, then *perturb a single invariant +//! field* (drop the rollups map, or shift `diffs.lower`/`diffs.upper` off +//! the expected seqno). `from_proto` must reject it with `Err`, never panic. + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use mz_persist_client::fuzz_exports::{ProtoRollup, Rollup}; +use mz_proto::{ProtoType, RustType}; +use prost::Message; + +// --- Minimal protobuf wire-format writer --------------------------------- +// +// We only need the few wire types used by `ProtoRollup` and its (private, +// not-re-exported) nested messages. Building the bytes by hand lets us feed +// the *real* `ProtoRollup::decode` a structurally valid message without naming +// types that the `fuzzing` feature does not export. + +const WIRE_VARINT: u32 = 0; +const WIRE_LEN: u32 = 2; + +fn put_varint(buf: &mut Vec, mut v: u64) { + loop { + let mut byte = (v & 0x7f) as u8; + v >>= 7; + if v != 0 { + byte |= 0x80; + } + buf.push(byte); + if v == 0 { + break; + } + } +} + +fn put_key(buf: &mut Vec, tag: u32, wire: u32) { + put_varint(buf, ((tag as u64) << 3) | (wire as u64)); +} + +/// Writes a varint (`uint64`/`int64`/`enum`) field. +fn put_uint(buf: &mut Vec, tag: u32, v: u64) { + put_key(buf, tag, WIRE_VARINT); + put_varint(buf, v); +} + +/// Writes a length-delimited field (string/bytes/sub-message).
+fn put_bytes(buf: &mut Vec, tag: u32, payload: &[u8]) { + put_key(buf, tag, WIRE_LEN); + put_varint(buf, payload.len() as u64); + buf.extend_from_slice(payload); +} + +// --- Pull-style reader over the fuzzer's parameter bytes ----------------- + +struct Unstructured<'a> { + bytes: &'a [u8], + pos: usize, +} + +impl<'a> Unstructured<'a> { + fn new(bytes: &'a [u8]) -> Self { + Unstructured { bytes, pos: 0 } + } + + fn u8(&mut self) -> u8 { + let b = self.bytes.get(self.pos).copied().unwrap_or(0); + self.pos += 1; + b + } + + fn u64(&mut self) -> u64 { + let mut v = 0u64; + for _ in 0..8 { + v = (v << 8) | u64::from(self.u8()); + } + v + } + + /// A small count in `[0, max]`, biased toward the low end. + fn count(&mut self, max: usize) -> usize { + if max == 0 { + return 0; + } + usize::from(self.u8()) % (max + 1) + } +} + +// --- Sub-message builders ------------------------------------------------- + +/// `ProtoU64Antichain { repeated int64 elements = 1; }` +fn antichain(elements: &[u64]) -> Vec { + let mut buf = Vec::new(); + for e in elements { + put_uint(&mut buf, 1, *e); + } + buf +} + +/// `ProtoTrace { ProtoU64Antichain since = 1; ... }`. An empty trace (no +/// batches) with just a `since` antichain, which unflattens cleanly. 
+fn empty_trace(since: &[u64]) -> Vec { + let mut buf = Vec::new(); + put_bytes(&mut buf, 1, &antichain(since)); + buf +} + +/// `ProtoHollowRollup { string key = 1; optional uint64 encoded_size_bytes = 2; }` +fn hollow_rollup(key: &str, encoded_size_bytes: Option) -> Vec { + let mut buf = Vec::new(); + put_bytes(&mut buf, 1, key.as_bytes()); + if let Some(sz) = encoded_size_bytes { + put_uint(&mut buf, 2, sz); + } + buf +} + +/// One entry of `map rollups = 16;` +fn rollups_entry(seqno: u64, value: &[u8]) -> Vec { + let mut buf = Vec::new(); + put_uint(&mut buf, 1, seqno); + put_bytes(&mut buf, 2, value); + buf +} + +/// `ProtoVersionedData { uint64 seqno = 1; bytes data = 2; }` +fn versioned_data(seqno: u64, data: &[u8]) -> Vec { + let mut buf = Vec::new(); + put_uint(&mut buf, 1, seqno); + put_bytes(&mut buf, 2, data); + buf +} + +/// `ProtoInlinedDiffs { uint64 lower = 1; uint64 upper = 2; repeated ProtoVersionedData diffs = 3; }` +fn inlined_diffs(lower: u64, upper: u64, diffs: &[(u64, Vec)]) -> Vec { + let mut buf = Vec::new(); + put_uint(&mut buf, 1, lower); + put_uint(&mut buf, 2, upper); + for (seqno, data) in diffs { + put_bytes(&mut buf, 3, &versioned_data(*seqno, data)); + } + buf +} + +/// Which invariant to break in mode 2. +enum Mutation { + DropRollups, + ShiftLower, + ShiftUpper, +} + +/// Builds a structurally valid `ProtoRollup` wire encoding driven by `u`. +/// +/// When `mutate` is `Some`, the named invariant is intentionally broken so the +/// caller can assert `from_proto` rejects it. +fn build_rollup(u: &mut Unstructured, mutate: Option) -> Vec { + // A fixed, parseable shard id. The interesting surface is the diff bounds, + // not shard-id parsing (covered by the raw-bytes arm). 
+ const SHARD_ID: &str = "s00000000-0000-0000-0000-000000000000"; + + let mut buf = Vec::new(); + + // shard_id = 1 + put_bytes(&mut buf, 1, SHARD_ID.as_bytes()); + // key_codec = 2, val_codec = 3, ts_codec = 4, diff_codec = 5 (free-form + // strings on the rollup path, so pick fixed values). + put_bytes(&mut buf, 2, b"()"); + put_bytes(&mut buf, 3, b"()"); + put_bytes(&mut buf, 4, b"u64"); + put_bytes(&mut buf, 5, b"i64"); + + // Choose rollup seqnos. `latest_rollup` is the max key in the map, and the + // state seqno must be >= it so the diff range is non-empty/well-formed. + let num_rollups = u.count(3).max(1); + let mut rollup_seqnos: Vec = Vec::with_capacity(num_rollups); + let mut next = 1 + (u.u64() % 1000); + for _ in 0..num_rollups { + rollup_seqnos.push(next); + next += 1 + (u.u64() % 1000); + } + let latest_rollup_seqno = *rollup_seqnos.iter().max().expect("non-empty"); + // state.seqno >= latest rollup seqno. + let state_seqno = latest_rollup_seqno + (u.u64() % 1000); + + let drop_rollups = matches!(mutate, Some(Mutation::DropRollups)); + if !drop_rollups { + for (i, seqno) in rollup_seqnos.iter().enumerate() { + let key = format!("rollup-key-{i}"); + let sz = if u.u8() & 1 == 0 { + Some(u.u64() % 1_000_000) + } else { + None + }; + put_bytes(&mut buf, 16, &rollups_entry(*seqno, &hollow_rollup(&key, sz))); + } + } + + // seqno = 6 + put_uint(&mut buf, 6, state_seqno); + // trace = 7 (required: `into_rust_if_some("trace")`). + put_bytes(&mut buf, 7, &empty_trace(&[u.u64() % 1000])); + // last_gc_req = 10 + put_uint(&mut buf, 10, u.u64() % 1000); + // applier_version = 11 (parseable semver, or empty for the "infinitely old" + // backward-compat path). + if u.u8() & 1 == 0 { + put_bytes(&mut buf, 11, b"0.1.2"); + } + // hostname = 14 + put_bytes(&mut buf, 14, b"fuzz-host"); + // walltime_ms = 15 + put_uint(&mut buf, 15, u.u64()); + + // Optionally attach inlined diffs with invariant-satisfying bounds. 
+ // + // The decoder requires: + // diffs.lower == latest_rollup_seqno + 1 + // diffs.upper == state_seqno + 1 + // (`SeqNo::next` is `+ 1`). Each inlined `VersionedData` must sit within + // `[lower, upper)`. Bounds become degenerate when `state_seqno == + // latest_rollup_seqno`, in which case there is no valid diff seqno. + let want_diffs = mutate.is_some() || (u.u8() & 1 == 0); + if want_diffs { + let mut lower = latest_rollup_seqno + 1; + let mut upper = state_seqno + 1; + match &mutate { + Some(Mutation::ShiftLower) => lower = lower.wrapping_add(1 + (u.u64() % 5)), + Some(Mutation::ShiftUpper) => upper = upper.wrapping_add(1 + (u.u64() % 5)), + _ => {} + } + + // Pick in-range diff seqnos only for the un-mutated/drop-rollups cases. + // `from_proto` does not bound-check individual diff seqnos, so this is + // about producing realistic content rather than satisfying an invariant. + let mut diffs: Vec<(u64, Vec)> = Vec::new(); + if state_seqno > latest_rollup_seqno { + let span = upper.saturating_sub(lower).min(4); + let n = if span == 0 { 0 } else { u.count(span as usize) }; + for i in 0..n { + let seqno = lower + (i as u64); + diffs.push((seqno, vec![u.u8(), u.u8(), u.u8()])); + } + } + put_bytes(&mut buf, 17, &inlined_diffs(lower, upper, &diffs)); + } + + buf +} + +fn roundtrip(proto: ProtoRollup) { + let orig: Rollup = match proto.into_rust() { + Ok(v) => v, + Err(_) => return, + }; + + let proto2: ProtoRollup = orig.into_proto(); + let bytes2 = proto2.encode_to_vec(); + let proto3 = ProtoRollup::decode(bytes2.as_slice()) + .expect("re-encode of valid Rollup must decode"); + let round: Rollup = proto3 + .into_rust() + .expect("re-encoded Rollup must convert back to Rust"); + let proto4: ProtoRollup = round.into_proto(); + + assert_eq!(proto2, proto4, "Rollup changed across proto roundtrip"); +} + +fuzz_target!(|data: &[u8]| { + let (mode, rest) = match data.split_first() { + Some((m, r)) => (*m, r), + None => return, + }; + + match mode % 3 { + 0 => { + // 
Robustness arm: arbitrary bytes must never panic the decoder, and + // anything that converts must round-trip. + let Ok(proto) = ProtoRollup::decode(rest) else { + return; + }; + roundtrip(proto); + } + 1 => { + // Valid-rollup arm: the synthesized message satisfies the diff + // invariants and must decode + round-trip. + let mut u = Unstructured::new(rest); + let bytes = build_rollup(&mut u, None); + let proto = + ProtoRollup::decode(bytes.as_slice()).expect("hand-built ProtoRollup must decode"); + roundtrip(proto); + } + _ => { + // Invariant-violation arm: break exactly one invariant and require + // `from_proto` to reject (not panic). + let mut u = Unstructured::new(rest); + let mutation = match u.u8() % 3 { + 0 => Mutation::DropRollups, + 1 => Mutation::ShiftLower, + _ => Mutation::ShiftUpper, + }; + let bytes = build_rollup(&mut u, Some(mutation)); + let proto = ProtoRollup::decode(bytes.as_slice()) + .expect("hand-built ProtoRollup must decode"); + let result: Result, _> = proto.into_rust(); + assert!( + result.is_err(), + "Rollup with a broken diff-bounds invariant must be rejected by from_proto" + ); + } + } +}); diff --git a/src/persist-client/fuzz/fuzz_targets/state_diff_proto_roundtrip.rs b/src/persist-client/fuzz/fuzz_targets/state_diff_proto_roundtrip.rs new file mode 100644 index 0000000000000..c3ca90b360095 --- /dev/null +++ b/src/persist-client/fuzz/fuzz_targets/state_diff_proto_roundtrip.rs @@ -0,0 +1,395 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: exercises `ProtoStateDiff` decoding and the `StateDiff` +//! `from_proto` conversion. State diffs are read from consensus on every state +//! 
update, so a decoder panic on a corrupted/crafted blob poisons the shard. +//! +//! The interesting surface is `ProtoStateDiff::field_diffs`, a *columnar* +//! encoding (`ProtoStateFieldDiffs`): parallel `fields`/`diff_types` arrays plus +//! a `data_lens`/`data_bytes` blob holding the per-diff key and value slices. +//! `from_proto` first `validate()`s that the slice counts and byte lengths are +//! self-consistent, then iterates, slicing `data_bytes` and decoding each slice +//! as the sub-proto for that field. Decoding *random* bytes as a protobuf +//! essentially never produces a `field_diffs` that survives `validate()` (let +//! alone one whose slices decode), so the columnar reader is never exercised. +//! +//! This target therefore hand-builds the columnar encoding on the protobuf wire +//! from fuzzer-chosen parameters. The first byte selects a mode: +//! +//! * mode 0: feed the remaining bytes straight to `ProtoStateDiff::decode` +//! (the robustness arm: must never panic, and any value that +//! converts must survive a proto re-encode round trip losslessly). +//! * mode 1: synthesize a *valid* columnar diff over a mix of fields and +//! insert/update/delete diff types, with self-consistent slice counts and +//! lengths, then decode + round-trip it. +//! * mode 2: synthesize a diff whose columnar bookkeeping is *inconsistent* +//! (slice count mismatch, or an unknown field/diff-type enum value). +//! `from_proto`'s `validate()`/iter must reject it with `Err`, never panic. + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use mz_persist_client::fuzz_exports::{ProtoStateDiff, StateDiff}; +use mz_proto::{ProtoType, RustType}; +use prost::Message; + +// --- Minimal protobuf wire-format writer --------------------------------- +// +// Building the bytes by hand lets us feed the *real* `ProtoStateDiff::decode` a +// well-formed columnar message without naming the nested proto types that the +// `fuzzing` feature does not re-export.
+ +const WIRE_VARINT: u32 = 0; +const WIRE_LEN: u32 = 2; + +fn put_varint(buf: &mut Vec, mut v: u64) { + loop { + let mut byte = (v & 0x7f) as u8; + v >>= 7; + if v != 0 { + byte |= 0x80; + } + buf.push(byte); + if v == 0 { + break; + } + } +} + +fn put_key(buf: &mut Vec, tag: u32, wire: u32) { + put_varint(buf, ((tag as u64) << 3) | (wire as u64)); +} + +/// Writes a varint (`uint64`/`int64`/`enum`) field. +fn put_uint(buf: &mut Vec, tag: u32, v: u64) { + put_key(buf, tag, WIRE_VARINT); + put_varint(buf, v); +} + +/// Writes a length-delimited field (string/bytes/sub-message). +fn put_bytes(buf: &mut Vec, tag: u32, payload: &[u8]) { + put_key(buf, tag, WIRE_LEN); + put_varint(buf, payload.len() as u64); + buf.extend_from_slice(payload); +} + +// --- Pull-style reader over the fuzzer's parameter bytes ----------------- + +struct Unstructured<'a> { + bytes: &'a [u8], + pos: usize, +} + +impl<'a> Unstructured<'a> { + fn new(bytes: &'a [u8]) -> Self { + Unstructured { bytes, pos: 0 } + } + + fn u8(&mut self) -> u8 { + let b = self.bytes.get(self.pos).copied().unwrap_or(0); + self.pos += 1; + b + } + + fn u64(&mut self) -> u64 { + let mut v = 0u64; + for _ in 0..8 { + v = (v << 8) | u64::from(self.u8()); + } + v + } + + /// A small count in `[lo, hi]`, biased toward the low end. + fn range(&mut self, lo: usize, hi: usize) -> usize { + if hi <= lo { + return lo; + } + lo + (usize::from(self.u8()) % (hi - lo + 1)) + } +} + +// --- Trivial-message encoders -------------------------------------------- +// +// prost encodes scalar "trivial messages" with the value at field tag 1 +// (`u64`/`i64` as a varint, `String`/`bytes` as length-delimited). The default +// value (0 / empty) encodes to *zero* bytes, so an empty slice is a valid +// encoding of every key/value type the diff reader decodes. + +/// Encoded slice for a `u64` trivial message. 
+fn enc_u64(v: u64) -> Vec { + let mut buf = Vec::new(); + if v != 0 { + put_uint(&mut buf, 1, v); + } + buf +} + +/// Encoded slice for a `String` trivial message. +fn enc_string(s: &str) -> Vec { + let mut buf = Vec::new(); + if !s.is_empty() { + put_bytes(&mut buf, 1, s.as_bytes()); + } + buf +} + +/// Encoded slice for `ProtoU64Antichain { repeated int64 elements = 1; }`. +fn enc_antichain(elements: &[u64]) -> Vec { + let mut buf = Vec::new(); + for e in elements { + put_uint(&mut buf, 1, *e); + } + buf +} + +/// Encoded slice for `ProtoHollowRollup { string key = 1; optional uint64 encoded_size_bytes = 2; }`. +fn enc_hollow_rollup(key: &str, encoded_size_bytes: Option) -> Vec { + let mut buf = Vec::new(); + if !key.is_empty() { + put_bytes(&mut buf, 1, key.as_bytes()); + } + if let Some(sz) = encoded_size_bytes { + put_uint(&mut buf, 2, sz); + } + buf +} + +/// Encoded slice for `ProtoActiveRollup`/`ProtoActiveGC { uint64 seqno = 1; uint64 start_ms = 2; }`. +fn enc_active(seqno: u64, start_ms: u64) -> Vec { + let mut buf = Vec::new(); + if seqno != 0 { + put_uint(&mut buf, 1, seqno); + } + if start_ms != 0 { + put_uint(&mut buf, 2, start_ms); + } + buf +} + +// ProtoStateField enum values (see diff.proto). +const FIELD_LAST_GC_REQ: u64 = 1; +const FIELD_SINCE: u64 = 4; +const FIELD_HOSTNAME: u64 = 7; +const FIELD_ROLLUPS: u64 = 8; +const FIELD_ACTIVE_ROLLUP: u64 = 13; +const FIELD_ACTIVE_GC: u64 = 14; + +// ProtoStateFieldDiffType enum values. +const DIFF_INSERT: u64 = 0; +const DIFF_UPDATE: u64 = 1; +const DIFF_DELETE: u64 = 2; + +/// A single columnar diff: the field, the diff type, and the already-encoded +/// key/value slices (1 key + 1 or 2 vals, matching the diff type). +struct ColumnarDiff { + field: u64, + diff_type: u64, + /// key slice followed by 1 (insert/delete) or 2 (update) value slices. + slices: Vec>, +} + +/// Picks a field + diff type + correctly-shaped key/value slices. 
+/// +/// All fields here decode losslessly from the encoded slices (scalar/simple +/// sub-protos with no required nested fields), so the un-mutated diff survives +/// a full proto round trip. Batch/reader/writer/schema fields are intentionally +/// avoided: their sub-protos have required nested messages that would not +/// decode from an empty/default slice. +fn gen_diff(u: &mut Unstructured) -> ColumnarDiff { + let diff_type = match u.u8() % 3 { + 0 => DIFF_INSERT, + 1 => DIFF_UPDATE, + _ => DIFF_DELETE, + }; + let num_vals = if diff_type == DIFF_UPDATE { 2 } else { 1 }; + + // (field, key slice, value-slice generator). + let (field, key): (u64, Vec) = match u.u8() % 6 { + 0 => (FIELD_HOSTNAME, Vec::new()), // key () + 1 => (FIELD_LAST_GC_REQ, Vec::new()), // key () + 2 => (FIELD_SINCE, Vec::new()), // key () + 3 => (FIELD_ROLLUPS, enc_u64(u.u64() % 10_000)), // key u64 (SeqNo) + 4 => (FIELD_ACTIVE_ROLLUP, Vec::new()), // key () + _ => (FIELD_ACTIVE_GC, Vec::new()), // key () + }; + + let mut slices = Vec::with_capacity(1 + num_vals); + slices.push(key); + for _ in 0..num_vals { + let val = match field { + FIELD_HOSTNAME => enc_string(&format!("host-{}", u.u8())), + FIELD_LAST_GC_REQ => enc_u64(u.u64() % 10_000), + FIELD_SINCE => { + let n = u.range(0, 2); + let elems: Vec = (0..n).map(|_| u.u64() % 10_000).collect(); + enc_antichain(&elems) + } + FIELD_ROLLUPS => { + let sz = if u.u8() & 1 == 0 { + Some(u.u64() % 1_000_000) + } else { + None + }; + enc_hollow_rollup(&format!("rollup-{}", u.u8()), sz) + } + FIELD_ACTIVE_ROLLUP | FIELD_ACTIVE_GC => { + enc_active(u.u64() % 10_000, u.u64() % 1_000_000) + } + _ => Vec::new(), + }; + slices.push(val); + } + + ColumnarDiff { + field, + diff_type, + slices, + } +} + +/// Serializes `ProtoStateFieldDiffs { fields, diff_types, data_lens, data_bytes }` +/// from a list of columnar diffs. 
+fn encode_field_diffs(diffs: &[ColumnarDiff]) -> Vec { + let mut fields = Vec::new(); + let mut diff_types = Vec::new(); + let mut data_lens = Vec::new(); + let mut data_bytes: Vec = Vec::new(); + + for d in diffs { + put_uint(&mut fields, 1, d.field); + put_uint(&mut diff_types, 2, d.diff_type); + for slice in &d.slices { + put_uint(&mut data_lens, 3, slice.len() as u64); + data_bytes.extend_from_slice(slice); + } + } + + let mut buf = Vec::new(); + buf.extend_from_slice(&fields); + buf.extend_from_slice(&diff_types); + buf.extend_from_slice(&data_lens); + if !data_bytes.is_empty() { + put_bytes(&mut buf, 4, &data_bytes); + } + buf +} + +/// Wraps a `field_diffs` payload into a full `ProtoStateDiff`. +fn encode_state_diff(u: &mut Unstructured, field_diffs: &[u8]) -> Vec { + let mut buf = Vec::new(); + // applier_version = 1 (parseable semver, or empty for the backward-compat path). + if u.u8() & 1 == 0 { + put_bytes(&mut buf, 1, b"0.1.2"); + } + // seqno_from = 2, seqno_to = 3 + put_uint(&mut buf, 2, u.u64() % 10_000); + put_uint(&mut buf, 3, u.u64() % 10_000); + // latest_rollup_key = 4 (free-form string) + put_bytes(&mut buf, 4, b"rollup-key"); + // field_diffs = 5 + put_bytes(&mut buf, 5, field_diffs); + // walltime_ms = 6 + put_uint(&mut buf, 6, u.u64()); + buf +} + +fn roundtrip(proto: ProtoStateDiff) { + let orig: StateDiff = match proto.into_rust() { + Ok(v) => v, + Err(_) => return, + }; + + let proto2: ProtoStateDiff = orig.into_proto(); + let bytes2 = proto2.encode_to_vec(); + let proto3 = ProtoStateDiff::decode(bytes2.as_slice()) + .expect("re-encode of valid StateDiff must decode"); + let round: StateDiff = proto3 + .into_rust() + .expect("re-encoded StateDiff must convert back to Rust"); + let proto4: ProtoStateDiff = round.into_proto(); + + assert_eq!(proto2, proto4, "StateDiff changed across proto roundtrip"); +} + +fuzz_target!(|data: &[u8]| { + let (mode, rest) = match data.split_first() { + Some((m, r)) => (*m, r), + None => return, + }; + + 
match mode % 3 { + 0 => { + // Robustness arm: arbitrary bytes must never panic the decoder, and + // anything that converts must round-trip. + let Ok(proto) = ProtoStateDiff::decode(rest) else { + return; + }; + roundtrip(proto); + } + 1 => { + // Valid columnar arm: a self-consistent set of field diffs that must + // decode + round-trip. + let mut u = Unstructured::new(rest); + let num_diffs = u.range(0, 6); + let diffs: Vec = (0..num_diffs).map(|_| gen_diff(&mut u)).collect(); + let field_diffs = encode_field_diffs(&diffs); + let bytes = encode_state_diff(&mut u, &field_diffs); + let proto = ProtoStateDiff::decode(bytes.as_slice()) + .expect("hand-built ProtoStateDiff must decode"); + roundtrip(proto); + } + _ => { + // Inconsistent-columnar arm: build a valid set, then corrupt the + // bookkeeping so `validate()`/iter must reject it (not panic). + let mut u = Unstructured::new(rest); + let num_diffs = u.range(1, 6); + let mut diffs: Vec = + (0..num_diffs).map(|_| gen_diff(&mut u)).collect(); + + match u.u8() % 4 { + 0 => { + // Drop a value slice: data_lens count no longer matches the + // count implied by diff_types. + if let Some(last) = diffs.last_mut() { + last.slices.pop(); + } + } + 1 => { + // Append a spurious data slice with no corresponding diff. + if let Some(last) = diffs.last_mut() { + last.slices.push(vec![0xff; u.range(1, 4)]); + } + } + 2 => { + // Unknown field enum value. + if let Some(last) = diffs.last_mut() { + last.field = 9999; + } + } + _ => { + // Unknown diff-type enum value. 
+ if let Some(last) = diffs.last_mut() { + last.diff_type = 9999; + } + } + } + + let field_diffs = encode_field_diffs(&diffs); + let bytes = encode_state_diff(&mut u, &field_diffs); + let proto = ProtoStateDiff::decode(bytes.as_slice()) + .expect("hand-built ProtoStateDiff must decode"); + let result: Result, _> = proto.into_rust(); + assert!( + result.is_err(), + "StateDiff with inconsistent columnar field_diffs must be rejected by from_proto" + ); + } + } +}); diff --git a/src/persist-client/src/lib.rs b/src/persist-client/src/lib.rs index 342fba325252e..6275edb7b182d 100644 --- a/src/persist-client/src/lib.rs +++ b/src/persist-client/src/lib.rs @@ -9,7 +9,13 @@ //! An abstraction presenting as a durable time-varying collection (aka shard) -#![warn(missing_docs, missing_debug_implementations)] +// The `fuzzing` feature re-exports internal types (see `fuzz_exports`) that are +// intentionally undocumented. Don't require docs/Debug for them in that +// test-only build. The normal public API is still linted. +#![cfg_attr( + not(feature = "fuzzing"), + warn(missing_docs, missing_debug_implementations) +)] // #[track_caller] is currently a no-op on async functions, but that hopefully won't be the case // forever. So we already annotate those functions now and ignore the compiler warning until // https://github.com/rust-lang/rust/issues/87417 pans out. @@ -98,6 +104,17 @@ pub mod stats; pub mod usage; pub mod write; +/// Internal durable-state types re-exported under `cfg(feature = "fuzzing")` so +/// the fuzz crate can drive their proto round-trips (`ProtoRollup`/`ProtoStateDiff` +/// are decoded from blob/consensus on every state load). Not part of the public +/// API. +#[cfg(feature = "fuzzing")] +pub mod fuzz_exports { + pub use crate::internal::encoding::Rollup; + pub use crate::internal::state::{ProtoRollup, ProtoStateDiff}; + pub use crate::internal::state_diff::StateDiff; +} + /// An implementation of the public crate interface. 
mod internal { pub mod apply; diff --git a/src/pgcopy/fuzz/.gitignore b/src/pgcopy/fuzz/.gitignore new file mode 100644 index 0000000000000..ab0eaa1a49031 --- /dev/null +++ b/src/pgcopy/fuzz/.gitignore @@ -0,0 +1,5 @@ +target/ +corpus/ +artifacts/ +coverage/ +Cargo.lock diff --git a/src/pgcopy/fuzz/Cargo.toml b/src/pgcopy/fuzz/Cargo.toml new file mode 100644 index 0000000000000..85351f06afc10 --- /dev/null +++ b/src/pgcopy/fuzz/Cargo.toml @@ -0,0 +1,29 @@ +# Fuzz crate for mz-pgcopy: decode untrusted `COPY ... FROM` data. COPY input +# arrives verbatim from a client, and the text/CSV field splitting plus the +# per-type value decoding are hand-written byte/text parsers sitting at the +# trust boundary, so any panic/SEGV reachable from the data is an availability +# bug. +# +# Excluded from the main workspace because libFuzzer requires nightly Rust. + +[package] +workspace = "../../../test/cargo-fuzz" +name = "mz-pgcopy-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +mz-pgcopy = { path = ".." } +mz-pgrepr = { path = "../../pgrepr" } + +[[bin]] +name = "copy_decode" +path = "fuzz_targets/copy_decode.rs" +test = false +doc = false +bench = false diff --git a/src/pgcopy/fuzz/fuzz_targets/copy_decode.rs b/src/pgcopy/fuzz/fuzz_targets/copy_decode.rs new file mode 100644 index 0000000000000..fe9e1bf426d98 --- /dev/null +++ b/src/pgcopy/fuzz/fuzz_targets/copy_decode.rs @@ -0,0 +1,297 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `mz_pgcopy::decode_copy_format` parses untrusted `COPY … FROM` +//! data (text and CSV) into rows. 
COPY data comes straight from a client, and +//! the field splitting plus per-type value decoding are hand-written parsers at +//! the trust boundary, so any panic is an availability bug. +//! +//! The field splitter accepts almost any bytes, but the per-type value decoders +//! (int/float/date/jsonb/uuid/bytea) reject random field text immediately, so +//! raw bytes exercise the line/field framing but barely reach the value +//! decoders. We instead consume the byte stream as grammar choices and emit a +//! valid row set for the fixed 9-column schema so each value decoder runs +//! through to its success path. Generation is tailored per format: +//! +//! * Text: per-type values delimiter-joined with `\N` NULL tokens, and the +//! text column sometimes carries real COPY-text escape sequences (`a\tb`, +//! `\x41`, `\052`, embedded `\n`, `\b\f\v\r`) so the `consume_raw_value` +//! backslash/hex/octal escape decoder runs end to end (its decoded bytes +//! stay valid UTF-8 so the Text decode still succeeds). +//! * CSV: the format params themselves are fuzzed the way the product's +//! `impl Arbitrary for CopyCsvFormatParams` does it (quote derived to differ +//! from the delimiter, an optional distinct escape, optional header, +//! optional non-empty NULL token). Every field that could contain the +//! active delimiter/quote/escape (notably the text column and the NULL +//! token) is properly quoted and escaped so it survives the csv-core framing +//! layer and reaches the value decoder instead of erroring early. When a +//! header is configured we additionally emit a leading header row so the +//! header-skip path is exercised alongside data rows. +//! +//! A quarter of inputs feed the raw bytes through both formats so the +//! framing/error paths stay covered. 
+ +#![no_main] + +use std::borrow::Cow; + +use libfuzzer_sys::arbitrary::{self, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_pgcopy::{CopyCsvFormatParams, CopyFormatParams, CopyTextFormatParams, decode_copy_format}; +use mz_pgrepr::Type; + +/// A fixed, type-diverse schema exercising many per-field value decoders. +const COLS: [Type; 9] = [ + Type::Int4, + Type::Text, + Type::Bool, + Type::Float8, + Type::Bytea, + Type::Date, + Type::Int8, + Type::Jsonb, + Type::Uuid, +]; + +const HEX: &[char] = &[ + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', +]; + +/// Emit the *logical* (already-unescaped) text representation of column `col`. +/// Callers escape the result as needed for the target format. `text_format` +/// only affects bytea: when set, its `\x` prefix is emitted here with the +/// backslash already doubled, as COPY text requires. +fn push_value( + u: &mut Unstructured, + col: usize, + text_format: bool, + out: &mut String, +) -> arbitrary::Result<()> { + match col { + 0 => out.push_str(&u.arbitrary::()?.to_string()), + 1 => { + let n = u.int_in_range(0usize..=6)?; + for _ in 0..n { + out.push(*u.choose(&['a', 'b', 'Z', '0', '9', ' '])?); + } + } + 2 => out.push_str(u.choose(&["t", "f", "true", "false"])?), + 3 => out.push_str(u.choose(&[ + "0", "-1.5", "3.14", "1e10", "-2.5e-3", "Infinity", "-Infinity", "NaN", + ])?), + 4 => { + // bytea hex: `\x`. In COPY text the backslash must be doubled. + out.push_str(if text_format { "\\\\x" } else { "\\x" }); + for _ in 0..u.int_in_range(0usize..=6)? { + out.push(*u.choose(HEX)?); + } + } + 5 => { + let y = u.int_in_range(2000u32..=2099)?; + let m = u.int_in_range(1u32..=12)?; + let d = u.int_in_range(1u32..=28)?; + out.push_str(&format!("{y:04}-{m:02}-{d:02}")); + } + 6 => out.push_str(&u.arbitrary::()?.to_string()), + // Comma/tab/backslash-free JSON so the same value is valid in both formats.
+ 7 => out.push_str(u.choose(&["1", "true", "null", "\"s\"", "{\"a\":1}", "[1]"])?), + _ => { + let b: [u8; 16] = u.arbitrary()?; + for (i, byte) in b.iter().enumerate() { + if matches!(i, 4 | 6 | 8 | 10) { + out.push('-'); + } + out.push_str(&format!("{byte:02x}")); + } + } + } + Ok(()) +} + +/// Append the text column (col 1) directly as COPY-text escape sequences, so +/// the `consume_raw_value` backslash/hex/octal decoder runs through its escape +/// branches. Every choice decodes to ASCII (valid UTF-8) and contains no literal +/// tab or newline byte (`\t`, `\n`, `\r` appear only escaped), so the field is +/// meant to stay single, decode as `Text`, and leave the line framing intact. +fn push_text_escapes(u: &mut Unstructured, out: &mut String) -> arbitrary::Result<()> { + let n = u.int_in_range(0usize..=4)?; + for _ in 0..n { + out.push_str(u.choose(&[ + // Literal chars (decode to themselves). + "a", "b", "Z", "0", + // Recognized C-style escapes (decode to control bytes). + "\\b", "\\f", "\\n", "\\r", "\\t", "\\v", + // Hex escapes in the printable ASCII range. + "\\x41", "\\x7e", "\\x2c", + // Octal escapes in the printable ASCII range. + "\\101", "\\052", "\\176", + // A backslash before a non-escape char drops the backslash. + "\\q", "\\\\", + ])?); + } + Ok(()) +} + +/// Append `field` to a CSV record, quoting+escaping it when it could otherwise +/// confuse the framing layer: when it contains the delimiter, quote, escape, +/// `\r`, or `\n`, when it is empty (so it is never mistaken for the empty-string +/// NULL marker), or when it equals the active NULL token (so a data value that +/// happens to match the NULL token is preserved as data rather than read as +/// SQL NULL).
+fn push_csv_field(field: &str, params: &CopyCsvFormatParams, out: &mut String) { + let q = params.quote as char; + let esc = params.escape as char; + let delim = params.delimiter as char; + let needs_quote = field.is_empty() + || *field == *params.null + || field + .chars() + .any(|c| c == delim || c == q || c == esc || c == '\r' || c == '\n'); + if needs_quote { + out.push(q); + for c in field.chars() { + if c == q || c == esc { + out.push(esc); + } + out.push(c); + } + out.push(q); + } else { + out.push_str(field); + } +} + +fn decode_text(data: &[u8]) { + let _ = decode_copy_format( + data, + &COLS, + CopyFormatParams::Text(CopyTextFormatParams { + null: Cow::Borrowed("\\N"), + delimiter: b'\t', + }), + ); +} + +fn decode_csv(data: &[u8], params: CopyCsvFormatParams<'static>) { + let _ = decode_copy_format(data, &COLS, CopyFormatParams::Csv(params)); +} + +/// Build CSV params the way `impl Arbitrary for CopyCsvFormatParams` does: +/// derive the quote so it differs from the delimiter, optionally a distinct +/// escape, an optional header, and an optional non-empty NULL token. The result +/// always satisfies the `try_new` invariant (quote != delimiter). +fn arbitrary_csv_params(u: &mut Unstructured) -> arbitrary::Result<CopyCsvFormatParams<'static>> { + let delimiter: u8 = u.arbitrary()?; + // Mirror the proptest strategy: pick a non-zero difference and wrap-add it + // to the delimiter, guaranteeing quote != delimiter. + let diff = u.arbitrary::<u8>()?.saturating_sub(1).max(1); + let quote = delimiter.wrapping_add(diff); + // Half the time use a distinct escape (the non-double-quote csv-core path). + let escape = if u.arbitrary()? { + let ediff = u.arbitrary::<u8>()?.saturating_sub(1).max(1); + quote.wrapping_add(ediff) + } else { + quote + }; + let header: bool = u.arbitrary()?; + let null = if u.arbitrary()? { + // A non-empty NULL token (e.g. the conventional `NULL` / `\N`). 
+ u.choose(&["NULL", "\\N", "null", "NA", "-"])?.to_string() + } else { + String::new() + }; + CopyCsvFormatParams::try_new( + Some(delimiter), + Some(quote), + Some(escape), + Some(header), + Some(null), + ) + .map_err(|_| arbitrary::Error::IncorrectFormat) +} + +fn run(mut u: Unstructured) -> arbitrary::Result<()> { + // A quarter of the time, feed the raw bytes through both formats: keeps the + // field/line framing and error paths covered. + if u.int_in_range(0u8..=3)? == 0 { + let rest = u.take_rest(); + decode_text(rest); + decode_csv(rest, CopyCsvFormatParams::default()); + return Ok(()); + } + + let text_format = u.int_in_range(0u8..=1)? == 0; + + if text_format { + let mut s = String::new(); + let rows = u.int_in_range(1usize..=4)?; + for _ in 0..rows { + for col in 0..COLS.len() { + if col > 0 { + s.push('\t'); + } + // 1-in-8 NULL via the `\N` token. + if u.int_in_range(0u8..=7)? == 0 { + s.push_str("\\N"); + } else if col == 1 && u.arbitrary()? { + // Drive the escape decoder through the text column. + push_text_escapes(&mut u, &mut s)?; + } else { + push_value(&mut u, col, true, &mut s)?; + } + } + s.push('\n'); + } + decode_text(s.as_bytes()); + return Ok(()); + } + + // CSV: fuzz the format params, then emit data the params can actually frame. + let params = arbitrary_csv_params(&mut u)?; + let delim = params.delimiter as char; + let mut s = String::new(); + + // A configured header means the first record is column names that the + // decoder skips. Emit a plausible one so the header-skip path runs. + if params.header { + for col in 0..COLS.len() { + if col > 0 { + s.push(delim); + } + push_csv_field(&format!("col{col}"), &params, &mut s); + } + s.push('\n'); + } + + let rows = u.int_in_range(1usize..=4)?; + for _ in 0..rows { + for col in 0..COLS.len() { + if col > 0 { + s.push(delim); + } + // 1-in-8 NULL: emit the unquoted NULL token (empty field for the + // default empty marker), which the decoder reads as SQL NULL. + if u.int_in_range(0u8..=7)? 
== 0 { + s.push_str(&params.null); + } else { + let mut field = String::new(); + push_value(&mut u, col, false, &mut field)?; + push_csv_field(&field, &params, &mut s); + } + } + s.push('\n'); + } + + decode_csv(s.as_bytes(), params); + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let _ = run(Unstructured::new(data)); +}); diff --git a/src/pgrepr/fuzz/.gitignore b/src/pgrepr/fuzz/.gitignore new file mode 100644 index 0000000000000..ab0eaa1a49031 --- /dev/null +++ b/src/pgrepr/fuzz/.gitignore @@ -0,0 +1,5 @@ +target/ +corpus/ +artifacts/ +coverage/ +Cargo.lock diff --git a/src/pgrepr/fuzz/Cargo.toml b/src/pgrepr/fuzz/Cargo.toml new file mode 100644 index 0000000000000..6398c36494914 --- /dev/null +++ b/src/pgrepr/fuzz/Cargo.toml @@ -0,0 +1,36 @@ +# Fuzz crate for mz-pgrepr: decode untrusted Postgres wire values. Bind +# parameters arrive from clients in text or binary format and are decoded +# per-type. The binary decoders (numeric base-10000, mz_acl_item, interval, +# unsigned ints) and the strconv-backed text decoders are hand-written +# byte/text parsers sitting at the client trust boundary, so any panic +# reachable from a parameter value is an availability bug. +# +# Excluded from the main workspace because libFuzzer requires nightly Rust. + +[package] +workspace = "../../../test/cargo-fuzz" +name = "mz-pgrepr-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +mz-pgrepr = { path = ".." 
} + +[[bin]] +name = "value_decode_binary" +path = "fuzz_targets/value_decode_binary.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "value_decode_text" +path = "fuzz_targets/value_decode_text.rs" +test = false +doc = false +bench = false diff --git a/src/pgrepr/fuzz/fuzz_targets/value_decode_binary.rs b/src/pgrepr/fuzz/fuzz_targets/value_decode_binary.rs new file mode 100644 index 0000000000000..5eb8310f8a8ac --- /dev/null +++ b/src/pgrepr/fuzz/fuzz_targets/value_decode_binary.rs @@ -0,0 +1,270 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `Value::decode_binary` decodes a client-supplied bind-parameter +//! value in Postgres *binary* format. The per-type decoders are hand-written +//! big-endian byte parsers (numeric base-10000, mz_acl_item slice reads, +//! interval, unsigned ints, jsonb). This is directly client-controlled input, +//! so any panic is an availability bug. Must never panic. +//! +//! A random byte string almost never satisfies these strict decoders: an exact +//! length check, a version byte, base-10000 digit bounds, a role-id variant tag. +//! So feeding raw bytes leaves the decoders barely exercised. Instead we pick +//! a type and *encode a valid binary value for it* (numeric header + digits, the +//! 16-byte interval triple, the 26-byte mz_aclitem, a jsonb version byte plus +//! real JSON, in-range date/time/timestamp), so the decoder runs all the way to +//! the value-construction and range-check logic. We still occasionally truncate +//! the valid encoding (to hit the length-validation / short-read paths) and, a +//! quarter of the time, fall back to the "any OID, raw bytes" mode so the +//! 
not-implemented branches and crafted-header paths stay covered. +//! +//! A few arms also reach past the "happy" shape on purpose: the numeric header +//! sometimes carries an out-of-band weight/dscale or digit words outside +//! `0..=9999` (negative or `>9999`) to exercise the base-10000 digit-bound and +//! scale math, and the bytea/text bodies are occasionally multi-KB so the +//! decoders' allocation/validation paths run on a large value rather than a +//! handful of bytes. + +#![no_main] + +use libfuzzer_sys::arbitrary::{self, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_pgrepr::{Type, Value}; + +/// Append a run of `0..=max` printable-ASCII bytes (valid UTF-8 for the string +/// decoders). +fn push_ascii(u: &mut Unstructured, b: &mut Vec, max: usize) -> arbitrary::Result<()> { + let n = u.int_in_range(0usize..=max)?; + for _ in 0..n { + b.push(u.int_in_range(0x20u8..=0x7e)?); + } + Ok(()) +} + +/// Append a binary `RoleId`: a variant tag byte (`s`/`g`/`u`/`p`) + u64 LE id. +fn push_role_id(u: &mut Unstructured, b: &mut Vec) -> arbitrary::Result<()> { + b.push(*u.choose(&[b's', b'g', b'u', b'p'])?); + b.extend_from_slice(&u.arbitrary::()?.to_le_bytes()); + Ok(()) +} + +/// Pick a type that has a binary decoder and encode a valid value for it. +fn gen_typed_value(u: &mut Unstructured) -> arbitrary::Result<(Type, Vec)> { + let mut b = Vec::new(); + let ty = match u.int_in_range(0u8..=25)? { + 0 => { + b.push(u.int_in_range(0u8..=1)?); + Type::Bool + } + 1 => { + // Usually a short body, but occasionally a multi-KB one so the + // bytea decoder's allocation/copy path runs on a large value. + let n = if u.int_in_range(0u8..=15)? == 0 { + u.int_in_range(1024usize..=8192)? + } else { + u.int_in_range(0usize..=16)? + }; + for _ in 0..n { + b.push(u.arbitrary::()?); + } + Type::Bytea + } + 2 => { + b.push(u.arbitrary::()?); + Type::Char + } + // Date: i32 BE days since 2000-01-01. from_pg_epoch range-checks. 
+ 3 => { + b.extend_from_slice(&u.arbitrary::()?.to_be_bytes()); + Type::Date + } + 4 => { + b.extend_from_slice(&u.arbitrary::()?.to_be_bytes()); + Type::Float4 + } + 5 => { + b.extend_from_slice(&u.arbitrary::()?.to_be_bytes()); + Type::Float8 + } + 6 => { + b.extend_from_slice(&u.arbitrary::()?.to_be_bytes()); + Type::Int2 + } + 7 => { + b.extend_from_slice(&u.arbitrary::()?.to_be_bytes()); + Type::Int4 + } + 8 => { + b.extend_from_slice(&u.arbitrary::()?.to_be_bytes()); + Type::Int8 + } + 9 => { + b.extend_from_slice(&u.arbitrary::()?.to_be_bytes()); + Type::UInt2 + } + 10 => { + b.extend_from_slice(&u.arbitrary::()?.to_be_bytes()); + Type::UInt4 + } + 11 => { + b.extend_from_slice(&u.arbitrary::()?.to_be_bytes()); + Type::UInt8 + } + // Interval: i64 micros + i32 days + i32 months, all BE (16 bytes). + 12 => { + b.extend_from_slice(&u.arbitrary::()?.to_be_bytes()); + b.extend_from_slice(&u.arbitrary::()?.to_be_bytes()); + b.extend_from_slice(&u.arbitrary::()?.to_be_bytes()); + Type::Interval { constraints: None } + } + // Jsonb: a version byte (1) followed by real JSON text. + 13 => { + b.push(1); + let json: &[u8] = match u.int_in_range(0u8..=7)? { + 0 => b"null", + 1 => b"true", + 2 => b"123", + 3 => b"-4.5", + 4 => b"\"s\"", + 5 => b"[1,2,3]", + 6 => b"{\"a\":1}", + _ => b"[]", + }; + b.extend_from_slice(json); + Type::Jsonb + } + 14 => { + push_ascii(u, &mut b, 64)?; + Type::Name + } + // Numeric: i16 ndigits, i16 weight, u16 sign, u16 dscale, then ndigits + // base-10000 words (each 0..=9999). + 15 => { + let nan = u.int_in_range(0u8..=5)? == 0; + let (ndigits, sign): (i16, u16) = if nan { + (0, 0xC000) + } else if u.int_in_range(0u8..=1)? == 0 { + (u.int_in_range(0i16..=4)?, 0x0000) + } else { + (u.int_in_range(0i16..=4)?, 0x4000) + }; + // Mostly a well-formed in-range header so the decoder reaches value + // construction. Occasionally an out-of-band weight/dscale so the + // scale/precision math runs on extreme exponents. 
+ let weight = if u.int_in_range(0u8..=7)? == 0 { + u.arbitrary::()? + } else { + u.int_in_range(-4i16..=4)? + }; + let dscale = if u.int_in_range(0u8..=7)? == 0 { + u.arbitrary::()? + } else { + u.int_in_range(0u16..=10)? + }; + b.extend_from_slice(&ndigits.to_be_bytes()); + b.extend_from_slice(&weight.to_be_bytes()); + b.extend_from_slice(&sign.to_be_bytes()); + b.extend_from_slice(&dscale.to_be_bytes()); + // Each base-10000 digit word should be 0..=9999. Occasionally emit + // an out-of-band word (>9999, or the full i16 range incl. negative) + // to exercise the digit-bound validation path. + let oob_words = u.int_in_range(0u8..=7)? == 0; + for _ in 0..ndigits { + let word = if oob_words { + u.arbitrary::()? + } else { + u.int_in_range(0i16..=9999)? + }; + b.extend_from_slice(&word.to_be_bytes()); + } + Type::Numeric { constraints: None } + } + 16 => { + b.extend_from_slice(&u.arbitrary::()?.to_be_bytes()); + Type::Oid + } + 17 => { + // Occasionally a multi-KB UTF-8 body so the text decoder's + // validation/copy runs on a large value, not just a short one. + let max = if u.int_in_range(0u8..=15)? == 0 { 8192 } else { 16 }; + push_ascii(u, &mut b, max)?; + Type::Text + } + 18 => { + push_ascii(u, &mut b, 16)?; + Type::BpChar { length: None } + } + 19 => { + push_ascii(u, &mut b, 16)?; + Type::VarChar { max_length: None } + } + // Time: i64 BE micros since midnight, in range. + 20 => { + b.extend_from_slice(&u.int_in_range(0i64..=86_399_999_999)?.to_be_bytes()); + Type::Time { precision: None } + } + // Timestamp(tz): i64 BE micros since 2000-01-01. Keep moderate so the + // CheckedTimestamp range check is reached on the accept path. 
+ 21 => { + let micros = u.int_in_range(-6_000_000_000_000_000i64..=6_000_000_000_000_000)?; + b.extend_from_slice(&micros.to_be_bytes()); + Type::Timestamp { precision: None } + } + 22 => { + let micros = u.int_in_range(-6_000_000_000_000_000i64..=6_000_000_000_000_000)?; + b.extend_from_slice(&micros.to_be_bytes()); + Type::TimestampTz { precision: None } + } + 23 => { + let uuid: [u8; 16] = u.arbitrary()?; + b.extend_from_slice(&uuid); + Type::Uuid + } + // mz_timestamp decodes a text u64. + 24 => { + b.extend_from_slice(u.arbitrary::()?.to_string().as_bytes()); + Type::MzTimestamp + } + // mz_aclitem: grantee role id, grantor role id, u64 LE acl mode. + _ => { + push_role_id(u, &mut b)?; + push_role_id(u, &mut b)?; + b.extend_from_slice(&u.arbitrary::()?.to_le_bytes()); + Type::MzAclItem + } + }; + Ok((ty, b)) +} + +fn run(mut u: Unstructured) -> arbitrary::Result<()> { + // A quarter of the time, the raw mode: any OID + raw remaining bytes. + // This keeps the not-implemented branches and the crafted-header / wrong + // length error paths covered. + if u.int_in_range(0u8..=3)? == 0 { + let oid = u32::from(u.arbitrary::()?); + let rest = u.take_rest(); + if let Ok(ty) = Type::from_oid(oid) { + let _ = Value::decode_binary(&ty, rest); + } + return Ok(()); + } + + let (ty, mut body) = gen_typed_value(&mut u)?; + // Occasionally truncate to hit the exact-length / short-read checks. + if !body.is_empty() && u.int_in_range(0u8..=7)? == 0 { + let keep = u.int_in_range(0usize..=body.len())?; + body.truncate(keep); + } + let _ = Value::decode_binary(&ty, &body); + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let _ = run(Unstructured::new(data)); +}); diff --git a/src/pgrepr/fuzz/fuzz_targets/value_decode_text.rs b/src/pgrepr/fuzz/fuzz_targets/value_decode_text.rs new file mode 100644 index 0000000000000..1ec5c744ff062 --- /dev/null +++ b/src/pgrepr/fuzz/fuzz_targets/value_decode_text.rs @@ -0,0 +1,389 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. 
+// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `Value::decode_text` decodes a client-supplied bind-parameter +//! value in Postgres *text* format. It dispatches on the type and delegates to +//! the `strconv` parsers (recursively, for array/list/map/record/range), all +//! over untrusted client bytes. Must never panic. +//! +//! A random byte string almost never reaches the interesting recursive +//! decoders: the array/list/map/range grammars need a leading brace/bracket and +//! a comma-separated body of *parseable element literals*, and the scalar +//! parsers (numeric, interval, timestamp, uuid, date) reject almost any random +//! ASCII. Feeding raw bytes therefore leaves the parsers stuck on their first +//! syntax check. Instead we build a `Type` directly (not via `from_oid`, which +//! cannot even produce `List`/`Map`) and synthesize a *well-formed text literal* +//! for it: scalar literals for the leaves, and properly braced, comma-separated, +//! optionally quoted/escaped, optionally `NULL`-bearing, optionally nested +//! bodies for `Array`/`List`/`Map`/`Range` (including the `empty` range and the +//! unsupported-but-parsed `[lo:hi]=` array-dimension prefix). This drives the +//! recursive element dispatch and the per-element scalar parsers all the way to +//! value construction and range/normalization checks. We still spend a quarter +//! of inputs in the "any OID, raw bytes" mode so the not-implemented branches +//! and the syntax-error paths stay covered. +//! +//! Excluded from the main workspace because libFuzzer requires nightly Rust. 
+ +#![no_main] + +use libfuzzer_sys::arbitrary::{self, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_pgrepr::{Type, Value}; + +/// A well-formed text literal for a scalar leaf type, paired with that type. +/// The literal is in the *unnested* representation (no extra quoting). The +/// container builders re-quote/escape it as needed. +fn gen_leaf(u: &mut Unstructured) -> arbitrary::Result<(Type, String)> { + Ok(match u.int_in_range(0u8..=14)? { + 0 => ( + Type::Bool, + (*u.choose(&["true", "false", "t", "f", "yes", "no", "on", "off", "1", "0"])?) + .to_string(), + ), + // Integers: in- and out-of-range so the parse-int overflow path is hit. + 1 => (Type::Int2, gen_int_literal(u)?), + 2 => (Type::Int4, gen_int_literal(u)?), + 3 => (Type::Int8, gen_int_literal(u)?), + 4 => (Type::UInt2, gen_int_literal(u)?), + 5 => (Type::UInt4, gen_int_literal(u)?), + 6 => (Type::UInt8, gen_int_literal(u)?), + 7 => (Type::Oid, gen_int_literal(u)?), + // Floats, including the special-token branches. + 8 => ( + Type::Float8, + (*u.choose(&[ + "0", "-0", "1.5", "-2.25", "3e10", "1.2e-3", "inf", "-inf", "Infinity", "NaN", ".5", + "1e400", + ])?) + .to_string(), + ), + 9 => (Type::Float4, gen_int_literal(u)?), + // Numeric: feed digit strings, exponents, and out-of-band magnitudes. + 10 => (Type::Numeric { constraints: None }, gen_numeric_literal(u)?), + // Interval: a grab bag of the unit/ISO/SQL-standard forms. + 11 => ( + Type::Interval { constraints: None }, + (*u.choose(&[ + "1 day", + "01:02:03", + "-1 year 2 mons", + "1-2", + "P1Y2M3DT4H5M6S", + "1 day 2:03:04.567", + "@ 5 hours ago", + "100000000 years", + "1.5 days", + ])?) + .to_string(), + ), + // Date / time / timestamp(tz): valid and edge-of-range forms. + 12 => ( + Type::Date, + (*u.choose(&[ + "2000-01-01", + "0001-01-01 BC", + "294276-12-31", + "infinity", + "-infinity", + "1999-02-29", + "2024-02-29", + ])?) 
+ .to_string(), + ), + 13 => ( + Type::Timestamp { precision: None }, + (*u.choose(&[ + "2000-01-01 00:00:00", + "1999-12-31 23:59:59.999999", + "294277-01-01 00:00:00", + "0001-01-01 00:00:00 BC", + "infinity", + "2024-02-29 12:34:56+05:30", + ])?) + .to_string(), + ), + // Uuid: canonical, braced, and hyphen-free spellings. + _ => ( + Type::Uuid, + (*u.choose(&[ + "00000000-0000-0000-0000-000000000000", + "ffffffffffffffffffffffffffffffff", + "{a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11}", + "A0EEBC999C0B4EF8BB6D6BB9BD380A11", + ])?) + .to_string(), + ), + }) +} + +/// An integer literal: small in-range values, boundary values, signs, leading +/// zeros, whitespace, and clearly-overflowing magnitudes. +fn gen_int_literal(u: &mut Unstructured) -> arbitrary::Result { + Ok(match u.int_in_range(0u8..=6)? { + 0 => u.int_in_range(-9i64..=9)?.to_string(), + 1 => u.arbitrary::()?.to_string(), + 2 => u.arbitrary::()?.to_string(), + 3 => u.arbitrary::()?.to_string(), + 4 => format!(" {} ", u.arbitrary::()?), + 5 => format!("+{}", u.int_in_range(0u64..=u64::MAX)?), + // Way past i64/i128: forces the overflow error path. + _ => "999999999999999999999999999999".to_string(), + }) +} + +/// A numeric literal: plain digits, fractions, exponents, sign, and magnitudes +/// well beyond the 39-digit / base-10000-word limits of the numeric decoder. +fn gen_numeric_literal(u: &mut Unstructured) -> arbitrary::Result { + Ok(match u.int_in_range(0u8..=7)? { + 0 => "0".to_string(), + 1 => u.arbitrary::()?.to_string(), + 2 => format!("{}.{}", u.arbitrary::()?, u.arbitrary::()?), + 3 => format!("{}e{}", u.int_in_range(1i32..=9)?, u.int_in_range(-40i32..=40)?), + 4 => "NaN".to_string(), + 5 => "-Infinity".to_string(), + // Long digit run (more than the 39 significant digits numeric keeps). 
+ 6 => "1".repeat(usize::from(u.int_in_range(40u8..=80)?)), + _ => format!("1e{}", u.int_in_range(100i32..=10000)?), + }) +} + +/// Escape an element body for embedding inside an array/list literal: optionally +/// wrap in double quotes (escaping `"` and `\`) or backslash-escape the +/// structural characters. Returns the body unchanged a third of the time so the +/// unquoted lexer path is exercised too. +fn escape_for_container(u: &mut Unstructured, body: &str) -> arbitrary::Result { + Ok(match u.int_in_range(0u8..=2)? { + 0 => body.to_string(), + 1 => { + let mut out = String::with_capacity(body.len() + 2); + out.push('"'); + for c in body.chars() { + if c == '"' || c == '\\' { + out.push('\\'); + } + out.push(c); + } + out.push('"'); + out + } + _ => { + let mut out = String::with_capacity(body.len()); + for c in body.chars() { + if matches!(c, '{' | '}' | ',' | '\\' | '"' | ' ') { + out.push('\\'); + } + out.push(c); + } + out + } + }) +} + +/// Recursively build a `(Type, literal)` pair, occasionally wrapping a value in +/// an `Array`, `List`, `Map`, or `Range` container with a well-formed body. +/// `depth` bounds the nesting so we always terminate. +fn gen_value(u: &mut Unstructured, depth: u8) -> arbitrary::Result<(Type, String)> { + // At max depth, or randomly, emit a scalar leaf. + if depth == 0 || u.int_in_range(0u8..=2)? == 0 { + return gen_leaf(u); + } + + Ok(match u.int_in_range(0u8..=3)? { + // Array: `{e1,e2,...}`, possibly multi-dimensional, possibly with NULLs, + // possibly prefixed with the (unsupported, but parsed) dimension syntax. + 0 => { + let (elem_ty, _) = gen_value(u, depth - 1)?; + let n = u.int_in_range(0usize..=4)?; + let mut body = String::new(); + // Occasionally emit the `[lo:hi]=` dimension prefix, which the parser + // recognizes and then rejects as unsupported. + if u.int_in_range(0u8..=7)? 
== 0 { + body.push_str(&format!("[{}:{}]=", u.int_in_range(-2i32..=2)?, n)); + } + // Optionally wrap in extra braces for a multi-dimensional shape. + let extra_dims = u.int_in_range(0u8..=2)?; + for _ in 0..extra_dims { + body.push('{'); + } + body.push('{'); + for i in 0..n { + if i > 0 { + body.push(','); + } + if u.int_in_range(0u8..=6)? == 0 { + body.push_str(*u.choose(&["NULL", "null", "NuLl"])?); + } else { + let (_, elem) = elem_literal(u, &elem_ty, depth - 1)?; + body.push_str(&escape_for_container(u, &elem)?); + } + } + body.push('}'); + for _ in 0..extra_dims { + body.push('}'); + } + (Type::Array(Box::new(elem_ty)), body) + } + // List: `{e1,e2,...}`. Nested lists allowed via embedded braces. + 1 => { + let (elem_ty, _) = gen_value(u, depth - 1)?; + let nested_list = matches!(elem_ty, Type::List(_)); + let n = u.int_in_range(0usize..=4)?; + let mut body = String::from("{"); + for i in 0..n { + if i > 0 { + body.push(','); + } + if u.int_in_range(0u8..=6)? == 0 { + body.push_str("NULL"); + } else { + let (_, elem) = elem_literal(u, &elem_ty, depth - 1)?; + // A nested list element keeps its braces. Other elements are + // quoted/escaped. + if nested_list { + body.push_str(&elem); + } else { + body.push_str(&escape_for_container(u, &elem)?); + } + } + } + body.push('}'); + (Type::List(Box::new(elem_ty)), body) + } + // Map: `{k1=>v1,k2=>v2,...}` with text keys. + 2 => { + let (val_ty, _) = gen_value(u, depth - 1)?; + let nested_map = matches!(val_ty, Type::Map { .. }); + let n = u.int_in_range(0usize..=4)?; + let mut body = String::from("{"); + for i in 0..n { + if i > 0 { + body.push(','); + } + let key = *u.choose(&["a", "b", "key one", "k\"q", "", "=>"])?; + body.push_str(&escape_for_container(u, key)?); + body.push_str("=>"); + if u.int_in_range(0u8..=6)? 
== 0 { + body.push_str("NULL"); + } else { + let (_, val) = elem_literal(u, &val_ty, depth - 1)?; + if nested_map { + body.push_str(&val); + } else { + body.push_str(&escape_for_container(u, &val)?); + } + } + } + body.push('}'); + (Type::Map { value_type: Box::new(val_ty) }, body) + } + // Range: `empty`, `[lo,hi)`, `(,hi]`, `[lo,)`, etc. Range elements must + // be a totally-ordered scalar, so restrict to one of the supported + // domains. + _ => { + let elem_ty = match u.int_in_range(0u8..=4)? { + 0 => Type::Int4, + 1 => Type::Int8, + 2 => Type::Numeric { constraints: None }, + 3 => Type::Date, + _ => Type::Timestamp { precision: None }, + }; + if u.int_in_range(0u8..=5)? == 0 { + return Ok((Type::Range { element_type: Box::new(elem_ty) }, "empty".to_string())); + } + let lo_inc = u.arbitrary::()?; + let hi_inc = u.arbitrary::()?; + let lo = if u.int_in_range(0u8..=2)? == 0 { + String::new() + } else { + gen_range_bound(u, &elem_ty)? + }; + let hi = if u.int_in_range(0u8..=2)? == 0 { + String::new() + } else { + gen_range_bound(u, &elem_ty)? + }; + let body = format!( + "{}{},{}{}", + if lo_inc { '[' } else { '(' }, + lo, + hi, + if hi_inc { ']' } else { ')' }, + ); + (Type::Range { element_type: Box::new(elem_ty) }, body) + } + }) +} + +/// Generate a literal for a *specific* element type (so container element types +/// stay consistent), bottoming out via the generic builder for containers. +fn elem_literal(u: &mut Unstructured, ty: &Type, depth: u8) -> arbitrary::Result<(Type, String)> { + match ty { + Type::Bool => Ok((ty.clone(), (*u.choose(&["t", "f", "true", "false"])?).to_string())), + Type::Int2 | Type::Int4 | Type::Int8 | Type::UInt2 | Type::UInt4 | Type::UInt8 + | Type::Oid | Type::Float4 => Ok((ty.clone(), gen_int_literal(u)?)), + Type::Float8 => Ok(( + ty.clone(), + (*u.choose(&["1.5", "-2.25", "inf", "NaN", "0"])?).to_string(), + )), + Type::Numeric { .. 
} => Ok((ty.clone(), gen_numeric_literal(u)?)), + Type::Date => Ok(( + ty.clone(), + (*u.choose(&["2000-01-01", "1999-12-31", "infinity"])?).to_string(), + )), + Type::Timestamp { .. } => Ok(( + ty.clone(), + (*u.choose(&["2000-01-01 00:00:00", "1999-12-31 23:59:59"])?).to_string(), + )), + Type::Uuid => Ok(( + ty.clone(), + "00000000-0000-0000-0000-000000000000".to_string(), + )), + Type::Interval { .. } => { + Ok((ty.clone(), (*u.choose(&["1 day", "01:02:03", "1-2"])?).to_string())) + } + // Containers and everything else: delegate to the recursive builder, + // which will pick its own (possibly different) shape but keep it valid. + _ => gen_value(u, depth), + } +} + +/// A scalar range-bound literal matching the range's element type. +fn gen_range_bound(u: &mut Unstructured, ty: &Type) -> arbitrary::Result { + Ok(match ty { + Type::Date => (*u.choose(&["2000-01-01", "1999-12-31", "2024-06-06"])?).to_string(), + Type::Timestamp { .. } => { + (*u.choose(&["2000-01-01 00:00:00", "2024-06-06 12:00:00"])?).to_string() + } + Type::Numeric { .. } => gen_numeric_literal(u)?, + // Int4 / Int8. + _ => gen_int_literal(u)?, + }) +} + +fn run(mut u: Unstructured) -> arbitrary::Result<()> { + // A quarter of the time, the raw mode: any OID + raw remaining bytes. + // This keeps the not-implemented branches (json, record, timetz, + // int2vector) and the scalar syntax-error paths covered. + if u.int_in_range(0u8..=3)? 
== 0 { + let oid = u32::from(u.arbitrary::()?); + let rest = u.take_rest(); + if let Ok(ty) = Type::from_oid(oid) { + let _ = Value::decode_text(&ty, rest); + } + return Ok(()); + } + + let (ty, body) = gen_value(&mut u, 3)?; + let _ = Value::decode_text(&ty, body.as_bytes()); + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let _ = run(Unstructured::new(data)); +}); diff --git a/src/pgtz/fuzz/.gitignore b/src/pgtz/fuzz/.gitignore new file mode 100644 index 0000000000000..ab0eaa1a49031 --- /dev/null +++ b/src/pgtz/fuzz/.gitignore @@ -0,0 +1,5 @@ +target/ +corpus/ +artifacts/ +coverage/ +Cargo.lock diff --git a/src/pgtz/fuzz/Cargo.toml b/src/pgtz/fuzz/Cargo.toml new file mode 100644 index 0000000000000..11619c80b435c --- /dev/null +++ b/src/pgtz/fuzz/Cargo.toml @@ -0,0 +1,26 @@ +# Fuzz crate for mz-pgtz: parse untrusted time-zone strings. `Timezone::parse` +# is a hand-written tokenizer + offset builder over arbitrary user input (the +# `AT TIME ZONE` / `SET timezone` value), so any panic is an availability bug. +# +# Excluded from the main workspace because libFuzzer requires nightly Rust. + +[package] +workspace = "../../../test/cargo-fuzz" +name = "mz-pgtz-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +mz-pgtz = { path = ".." } + +[[bin]] +name = "timezone_parse" +path = "fuzz_targets/timezone_parse.rs" +test = false +doc = false +bench = false diff --git a/src/pgtz/fuzz/fuzz_targets/timezone_parse.rs b/src/pgtz/fuzz/fuzz_targets/timezone_parse.rs new file mode 100644 index 0000000000000..6eaeb83420a34 --- /dev/null +++ b/src/pgtz/fuzz/fuzz_targets/timezone_parse.rs @@ -0,0 +1,223 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `mz_pgtz::timezone::Timezone::parse` parses untrusted time-zone +//! strings (the `AT TIME ZONE` / `SET timezone` value) with a hand-written +//! tokenizer + offset builder, in both ISO and POSIX modes. Any panic is an +//! availability bug. +//! +//! The interesting surface is the *offset tokenizer*: `tokenize_timezone` +//! grabs the first alphabetic run as a single `TzName` and returns immediately +//! (so any POSIX DST-rule tail is silently discarded, making fuzzing that +//! grammar dead weight), while everything else flows through `parse_num`, which +//! splits long all-digit runs into `[..hhhh]mm` chunks unless a `:` is present, +//! plus the punctuation-as-delimiter trimming and the `z`/`Z`-only-at-end rule. +//! `build_timezone_offset_second` then matches the token stream against twelve +//! fixed `±H[H][:M[M][:S[S]]]` / `±HHH` / `TzName` / `Zulu` shapes and enforces +//! the `hour<=15`, `min<60`, `sec<60` bounds. So we generate inputs that stress +//! exactly that math: long all-digit runs (`+00000100`, `+0000001:000001`), +//! the hour/min/sec boundaries (`+15:59:59`, `+16`, `+0:60`), the colon-vs-no- +//! colon `split_nums` toggle, punctuation-delimited junk around a real offset, +//! bare `z`/`Z` placed mid-string vs at the end, abbreviations drawn from +//! `TIMEZONE_ABBREVS`, and case-mangled IANA names. A quarter of inputs stay +//! the raw bytes so the tokenizer reject paths keep their coverage. + +#![no_main] + +use libfuzzer_sys::arbitrary::{self, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_pgtz::timezone::{Timezone, TimezoneSpec}; + +/// IANA names exercising fractional-hour offsets and DST, in canonical casing. +/// `gen_named` may re-case them to hit the case-insensitive lookup path. 
+const NAMED: &[&str] = &[ + "UTC", + "GMT", + "America/New_York", + "Europe/London", + "Asia/Kolkata", // :30 offset + "Australia/Lord_Howe", // :30 offset with DST + "Pacific/Chatham", // :45 offset + "America/Argentina/Buenos_Aires", + "Etc/GMT+12", + "posixrules", +]; + +/// A spread of abbreviations from `TIMEZONE_ABBREVS`: fixed-offset ones, DST +/// ones, and ones that alias to a `Tz`, so the abbrev lookup + fallback to +/// `Tz::from_str_insensitive` both run. `EST`/`PST`/... also double as the +/// leading `std` name of a POSIX-looking string (whose offset tail is what the +/// tokenizer actually keeps). +const ABBREVS: &[&str] = &[ + "EST", "EDT", "PST", "PDT", "CST", "CDT", "MST", "MDT", "CET", "CEST", "EET", + "EEST", "BST", "IST", "JST", "ACDT", "ACST", "AEST", "AEDT", "NZST", "NZDT", + "CHADT", "CHAST", "HKT", "WET", "WEST", "UCT", "ZULU", "GMT", "UTC", +]; + +/// Emit a numeric UTC offset, biased toward the tokenizer/builder boundaries: +/// `z`/`Z`, `±HH`, `±HH:MM`, `±HH:MM:SS`, long all-digit runs that `parse_num` +/// must chunk, and the exact `hour<=15` / `min<60` / `sec<60` edges. +fn gen_offset(u: &mut Unstructured, out: &mut String) -> arbitrary::Result<()> { + match u.int_in_range(0u8..=8)? { + // Bare Zulu (only valid at end-of-string). + 0 => { + out.push(if u.ratio(1, 2)? { 'z' } else { 'Z' }); + return Ok(()); + } + // Hour at/around the `<= 15` boundary. + 1 => { + out.push(sign(u)?); + out.push_str(&format!("{:02}", u.int_in_range(13u32..=17)?)); + } + // `±HH:MM` with minute at/around the `< 60` boundary. + 2 => { + out.push(sign(u)?); + out.push_str(&format!( + "{:02}:{:02}", + u.int_in_range(0u32..=15)?, + u.int_in_range(57u32..=61)? + )); + } + // `±HH:MM:SS` with second at/around the `< 60` boundary, e.g. `+15:59:59`. + 3 => { + out.push(sign(u)?); + out.push_str(&format!( + "{:02}:{:02}:{:02}", + u.int_in_range(0u32..=15)?, + u.int_in_range(0u32..=59)?, + u.int_in_range(57u32..=61)? 
+            ));
+        }
+        // Long all-digit run (no colon): exercises the `split_nums` `[..hh]mm`
+        // chunking and leading-zero handling, e.g. `+00000100`, `+0000005`.
+        4 => {
+            out.push(sign(u)?);
+            let zeros = u.int_in_range(0u32..=8)?;
+            for _ in 0..zeros {
+                out.push('0');
+            }
+            out.push_str(&u.int_in_range(0u32..=999)?.to_string());
+        }
+        // Colon-delimited long all-digit runs (colon disables `split_nums`),
+        // e.g. `+0000001:000001:000001`.
+        5 => {
+            out.push(sign(u)?);
+            let parts = u.int_in_range(1u8..=3)?;
+            for p in 0..parts {
+                if p > 0 {
+                    out.push(':');
+                }
+                let zeros = u.int_in_range(0u32..=7)?;
+                for _ in 0..zeros {
+                    out.push('0');
+                }
+                out.push_str(&u.int_in_range(0u32..=99)?.to_string());
+            }
+        }
+        // Ordinary `±HH[:MM[:SS]]` across the full valid range.
+        _ => {
+            out.push(sign(u)?);
+            out.push_str(&format!("{:02}", u.int_in_range(0u32..=15)?));
+            match u.int_in_range(0u8..=2)? {
+                0 => {}
+                1 => out.push_str(&format!(":{:02}", *u.choose(&[0u32, 15, 30, 45])?)),
+                _ => out.push_str(&format!(
+                    ":{:02}:{:02}",
+                    *u.choose(&[0u32, 30, 45])?,
+                    u.int_in_range(0u32..=59)?
+                )),
+            }
+        }
+    }
+    Ok(())
+}
+
+fn sign(u: &mut Unstructured) -> arbitrary::Result<char> {
+    Ok(if u.ratio(1, 2)? { '+' } else { '-' }) // fair coin between the two offset signs
+}
+
+/// Emit an IANA name, sometimes case-mangled to hit `from_str_insensitive`.
+fn gen_named(u: &mut Unstructured, out: &mut String) -> arbitrary::Result<()> {
+    let name = *u.choose(NAMED)?;
+    match u.int_in_range(0u8..=3)? {
+        0 => out.push_str(&name.to_lowercase()),
+        1 => out.push_str(&name.to_uppercase()),
+        2 => {
+            // Alternate-case mangling.
+            for (i, c) in name.chars().enumerate() {
+                if i % 2 == 0 {
+                    out.extend(c.to_lowercase());
+                } else {
+                    out.extend(c.to_uppercase());
+                }
+            }
+        }
+        _ => out.push_str(name),
+    }
+    Ok(())
+}
+
+/// Wrap an inner spec in leading/trailing whitespace and ASCII punctuation,
+/// which the tokenizer trims (except `+`/`-`) or treats as `Delim`. This keeps
+/// the `" ! ? ! - 5:15 ? ! ? "`-style paths covered.
+fn gen_punct_wrapped(u: &mut Unstructured, out: &mut String) -> arbitrary::Result<()> { + const JUNK: &[char] = &[' ', '!', '?', '.', ',', '*', '/', '#', '~', '\t']; + let lead = u.int_in_range(0u8..=3)?; + for _ in 0..lead { + out.push(*u.choose(JUNK)?); + } + gen_offset(u, out)?; + let trail = u.int_in_range(0u8..=3)?; + for _ in 0..trail { + out.push(*u.choose(JUNK)?); + } + Ok(()) +} + +fn gen_tz(u: &mut Unstructured, out: &mut String) -> arbitrary::Result<()> { + match u.int_in_range(0u8..=6)? { + 0 => gen_named(u, out)?, + 1 => out.push_str(u.choose(ABBREVS)?), + 2 | 3 => gen_offset(u, out)?, + 4 => gen_punct_wrapped(u, out)?, + // An abbreviation immediately followed by an offset: the tokenizer keeps + // the abbrev as a `TzName` and returns, so the offset tail is ignored, + // but this still stresses the "first alpha wins" early return. + 5 => { + out.push_str(u.choose(ABBREVS)?); + gen_offset(u, out)?; + } + // A bare `z`/`Z` placed *before* more text, so it is NOT at end-of-string + // and must be tokenized as a `TzName`, not `Zulu`. + _ => { + out.push(if u.ratio(1, 2)? { 'z' } else { 'Z' }); + gen_offset(u, out)?; + } + } + Ok(()) +} + +fn run(mut u: Unstructured) -> arbitrary::Result<()> { + // A quarter of the time, the raw bytes: keeps the tokenizer reject paths + // covered. + let spec = if u.int_in_range(0u8..=3)? == 0 { + String::from_utf8_lossy(u.take_rest()).into_owned() + } else { + let mut s = String::new(); + gen_tz(&mut u, &mut s)?; + s + }; + let _ = Timezone::parse(&spec, TimezoneSpec::Iso); + let _ = Timezone::parse(&spec, TimezoneSpec::Posix); + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let _ = run(Unstructured::new(data)); +}); diff --git a/src/pgwire/Cargo.toml b/src/pgwire/Cargo.toml index e5057681ca1ab..cd8abca228bcf 100644 --- a/src/pgwire/Cargo.toml +++ b/src/pgwire/Cargo.toml @@ -45,3 +45,6 @@ uuid = { workspace = true, features = ["v4"] } [features] default = [] +# Re-exports internal types (`codec::Codec` etc.) 
so the fuzz crate can +# drive the frontend-message decoder directly. Not for production use. +fuzzing = [] diff --git a/src/pgwire/fuzz/.gitignore b/src/pgwire/fuzz/.gitignore new file mode 100644 index 0000000000000..ab0eaa1a49031 --- /dev/null +++ b/src/pgwire/fuzz/.gitignore @@ -0,0 +1,5 @@ +target/ +corpus/ +artifacts/ +coverage/ +Cargo.lock diff --git a/src/pgwire/fuzz/Cargo.toml b/src/pgwire/fuzz/Cargo.toml new file mode 100644 index 0000000000000..0297c8efd1484 --- /dev/null +++ b/src/pgwire/fuzz/Cargo.toml @@ -0,0 +1,27 @@ +# Fuzz crate for mz-pgwire: drive the frontend-message decoder over random +# bytes. The decoder sits at the trust boundary between an untrusted SQL +# client and environmentd, so any panic/SEGV reachable from the wire format +# is a real availability bug. + +[package] +workspace = "../../../test/cargo-fuzz" +name = "mz-pgwire-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +mz-pgwire = { path = "..", features = ["fuzzing"] } +bytes = "1" +tokio-util = { version = "0.7", features = ["codec"] } + +[[bin]] +name = "codec_decode" +path = "fuzz_targets/codec_decode.rs" +test = false +doc = false +bench = false diff --git a/src/pgwire/fuzz/corpus.dict b/src/pgwire/fuzz/corpus.dict new file mode 100644 index 0000000000000..81586cb1f734a --- /dev/null +++ b/src/pgwire/fuzz/corpus.dict @@ -0,0 +1,34 @@ +# libFuzzer dictionary for the pgwire codec_decode target. +# +# Frontend messages are a 1-byte type tag + 4-byte big-endian length + payload +# (startup/SSL/cancel messages omit the tag). Seeding the type tags and the +# special startup/SSL/cancel magic version codes lets the mutator reach the +# per-message decoders instead of being rejected as an unknown tag. + +# Frontend message type tags. +"Q" +"P" +"B" +"E" +"D" +"C" +"H" +"S" +"X" +"d" +"c" +"f" +"p" +"F" +# Untagged startup-phase length+version prefixes. 
+"\x00\x00\x00\x08\x04\xd2\x16\x2f" +"\x00\x00\x00\x10\x04\xd2\x16\x2e" +"\x00\x03\x00\x00" +# Common startup parameter keys. +"user\x00" +"database\x00" +"application_name\x00" +"client_encoding\x00" +# SASL mechanism names. +"SCRAM-SHA-256" +"SCRAM-SHA-256-PLUS" diff --git a/src/pgwire/fuzz/fuzz_targets/codec_decode.rs b/src/pgwire/fuzz/fuzz_targets/codec_decode.rs new file mode 100644 index 0000000000000..a15c5491ec13e --- /dev/null +++ b/src/pgwire/fuzz/fuzz_targets/codec_decode.rs @@ -0,0 +1,291 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: drive `mz_pgwire`'s frontend-message `Codec` over arbitrary +//! bytes. The decoder lives at the trust boundary between SQL clients and +//! environmentd, so any panic/SEGV reachable from the wire is a real +//! availability bug. +//! +//! A frame is `[type:1][len:4 BE][body:len-4]`. Random bytes rarely have a +//! length field that matches the bytes that follow, so the decoder bails in the +//! header before reaching the per-message body parsers (Query/Parse/Bind/ +//! Describe/Execute/…), and once one frame errors, the streaming decoder stops, +//! so later frames never decode either. So we consume the byte stream as grammar +//! choices and emit correctly-framed messages: a valid type tag, the right +//! length, and (usually) a valid body for that type, concatenating several so +//! the decoder walks frame after frame. A quarter of inputs are still the raw +//! bytes, and a quarter of frames carry an arbitrary body, so the header +//! validation and per-message error paths stay covered. +//! +//! Beyond well-formed frames we deliberately stress two thin spots: +//! +//! 
* **Count-driven loops.** The body parsers for Parse and Bind read an `i16`
+//!   element count (param-type / format-code / parameter counts) and then loop
+//!   that many times reading from the body, and a Bind parameter declares its
+//!   own `i32` byte length. We sometimes emit a huge count or length (up to
+//!   `i16::MAX` / a large positive `i32`) backed by a body far too short to
+//!   satisfy it, so the loops read off the end of the cursor and must error out
+//!   gracefully rather than over-read, over-allocate, or panic. Long cstrings
+//!   feed the same idea on the string side.
+//!
+//! * **Streaming / partial-frame reassembly.** The codec is a `tokio_util`
+//!   `Decoder`: it advances `Head -> Data -> Head` across calls and returns
+//!   `Ok(None)` whenever the body promised by the length field hasn't fully
+//!   arrived yet. Feeding the whole stream in one `BytesMut` never lands mid
+//!   frame, so we (a) sometimes hand frames a length field that overstates the
+//!   real body, and (b) drip the byte stream into the decoder in arbitrary
+//!   chunks, so it parks in the `Data` await-more-bytes state and resumes when
+//!   the rest shows up.
+//!
+//! Errors are expected. What we assert is the absence of panics and
+//! memory-safety violations.
+
+#![no_main]
+
+use bytes::BytesMut;
+use libfuzzer_sys::arbitrary::{self, Unstructured};
+use libfuzzer_sys::fuzz_target;
+use mz_pgwire::fuzz_exports::Codec;
+use tokio_util::codec::Decoder;
+
+/// Frontend message type tags the codec dispatches on.
+const TAGS: &[u8] = &[
+    b'Q', b'P', b'D', b'B', b'E', b'H', b'S', b'C', b'X', b'p', b'f', b'd', b'c',
+];
+
+fn push_cstr(u: &mut Unstructured, out: &mut Vec<u8>) -> arbitrary::Result<()> {
+    let n = u.int_in_range(0usize..=8)?;
+    for _ in 0..n {
+        // Printable, non-NUL.
+        out.push(u.int_in_range(0x20u8..=0x7e)?);
+    }
+    out.push(0);
+    Ok(())
+}
+
+/// Append a long (but bounded) printable, NUL-terminated string. Stresses the
+/// `read_cstr` scan and downstream allocations without blowing past
+/// `MAX_REQUEST_SIZE`.
+fn push_long_cstr(u: &mut Unstructured, out: &mut Vec<u8>) -> arbitrary::Result<()> {
+    let n = u.int_in_range(64usize..=4096)?;
+    let fill = u.int_in_range(0x20u8..=0x7e)?;
+    out.resize(out.len() + n, fill);
+    out.push(0);
+    Ok(())
+}
+
+fn be16(out: &mut Vec<u8>, v: i16) {
+    out.extend_from_slice(&v.to_be_bytes());
+}
+
+fn be32(out: &mut Vec<u8>, v: i32) {
+    out.extend_from_slice(&v.to_be_bytes());
+}
+
+/// Pick an element count for a count-driven loop. Usually small and matched by
+/// the body that follows, but sometimes a large value the body can't satisfy so
+/// the loop reads off the end of the cursor and must error rather than over-read
+/// or over-allocate. Returns `(declared_count, honest_count)`: `declared_count`
+/// is written to the wire, `honest_count` is how many elements we actually emit.
+fn count(u: &mut Unstructured) -> arbitrary::Result<(i16, i16)> {
+    match u.int_in_range(0u8..=7)? {
+        // Mostly: a small, honest count fully backed by the body.
+        0..=4 => {
+            let n = u.int_in_range(0i16..=3)?;
+            Ok((n, n))
+        }
+        // A large declared count with no/too-few backing elements: the loop
+        // should run out of buffer and bail.
+        5 => Ok((u.int_in_range(1i16..=i16::MAX)?, 0)),
+        6 => Ok((i16::MAX, u.int_in_range(0i16..=2)?)),
+        // A negative count: the `for _ in 0..n` loop runs zero times, so the
+        // remaining body is interpreted as the next field/frame.
+        7 => Ok((u.int_in_range(i16::MIN..=-1)?, 0)),
+        _ => unreachable!(),
+    }
+}
+
+/// Build a valid body for message `tag`.
+fn gen_body(u: &mut Unstructured, tag: u8, out: &mut Vec<u8>) -> arbitrary::Result<()> {
+    match tag {
+        // Empty-body messages.
+        b'X' | b'S' | b'H' | b'c' => {}
+        // Simple query / copy-fail: a single cstring.
+        b'Q' | b'f' => maybe_long_cstr(u, out)?,
+        // Password / generic auth: a cstring is a plausible password message.
+        b'p' => maybe_long_cstr(u, out)?,
+        // CopyData: arbitrary payload.
+        b'd' => {
+            for _ in 0..u.int_in_range(0usize..=16)? {
+                out.push(u.arbitrary::<u8>()?);
+            }
+        }
+        // Describe / Close: a 'S'tatement|'P'ortal byte then a name cstring.
+        b'D' | b'C' => {
+            out.push(if u.int_in_range(0u8..=1)? == 0 { b'S' } else { b'P' });
+            maybe_long_cstr(u, out)?;
+        }
+        // Execute: portal cstring + max-rows i32.
+        b'E' => {
+            maybe_long_cstr(u, out)?;
+            be32(out, u.arbitrary::<i32>()?);
+        }
+        // Parse: name + query cstrings + param-type oids.
+        b'P' => {
+            maybe_long_cstr(u, out)?;
+            maybe_long_cstr(u, out)?;
+            let (declared, honest) = count(u)?;
+            be16(out, declared);
+            for _ in 0..honest {
+                be32(out, u.arbitrary::<i32>()?);
+            }
+        }
+        // Bind: portal + stmt cstrings, format codes, parameters, result formats.
+        b'B' => {
+            maybe_long_cstr(u, out)?;
+            maybe_long_cstr(u, out)?;
+            let (declared, honest) = count(u)?;
+            be16(out, declared);
+            for _ in 0..honest {
+                be16(out, u.int_in_range(0i16..=1)?);
+            }
+            let (declared, honest) = count(u)?;
+            be16(out, declared);
+            for _ in 0..honest {
+                match u.int_in_range(0u8..=4)? {
+                    0 => be32(out, -1), // NULL parameter
+                    // A large declared length with a short (or empty) value: the
+                    // per-byte read loop should run out of buffer and bail.
+                    1 => {
+                        be32(out, u.int_in_range(1i32..=i32::MAX)?);
+                        for _ in 0..u.int_in_range(0usize..=2)? {
+                            out.push(u.arbitrary::<u8>()?);
+                        }
+                    }
+                    _ => {
+                        let len = u.int_in_range(0usize..=4)?;
+                        be32(out, len as i32);
+                        for _ in 0..len {
+                            out.push(u.arbitrary::<u8>()?);
+                        }
+                    }
+                }
+            }
+            let (declared, honest) = count(u)?;
+            be16(out, declared);
+            for _ in 0..honest {
+                be16(out, u.int_in_range(0i16..=1)?);
+            }
+        }
+        _ => {}
+    }
+    Ok(())
+}
+
+/// A cstring that is usually short but occasionally long, to stress the scan
+/// and downstream string allocations.
+fn maybe_long_cstr(u: &mut Unstructured, out: &mut Vec<u8>) -> arbitrary::Result<()> {
+    if u.int_in_range(0u8..=7)?
== 0 {
+        push_long_cstr(u, out)
+    } else {
+        push_cstr(u, out)
+    }
+}
+
+fn push_frame(u: &mut Unstructured, out: &mut Vec<u8>) -> arbitrary::Result<()> {
+    let tag = *u.choose(TAGS)?;
+    let mut body = Vec::new();
+    // A quarter of frames carry an arbitrary body so the per-message parsers'
+    // error handling stays covered. The rest are valid for their type.
+    if u.int_in_range(0u8..=3)? == 0 {
+        for _ in 0..u.int_in_range(0usize..=16)? {
+            body.push(u.arbitrary::<u8>()?);
+        }
+    } else {
+        gen_body(u, tag, &mut body)?;
+    }
+    out.push(tag);
+    // Length field counts itself (4 bytes) plus the body, but not the tag.
+    // Usually honest, but occasionally we overstate it so the streaming decoder
+    // parks in the `Data` await-more-bytes state expecting bytes that may or may
+    // not arrive (a later frame's bytes get reinterpreted as this body, or the
+    // stream simply ends mid-frame).
+    let honest = (body.len() as u32) + 4;
+    let declared = if u.int_in_range(0u8..=7)? == 0 {
+        // Overstate by a bounded amount. `parse_frame_len` rejects anything over
+        // MAX_FRAME_SIZE (64 MiB), so keep the claim well under that.
+        honest.saturating_add(u.int_in_range(1u32..=4096)?)
+    } else {
+        honest
+    };
+    out.extend_from_slice(&declared.to_be_bytes());
+    out.extend_from_slice(&body);
+    Ok(())
+}
+
+/// Feed `data` to the codec. When `chunked`, drip it in arbitrary-sized slices
+/// so the decoder repeatedly parks in its partial-frame (`Ok(None)`) state and
+/// resumes when more bytes land, a path one all-at-once `BytesMut` never hits
+/// mid-frame. Feeding stops at the first decode error, as a framed stream would.
+fn pump(u: &mut Unstructured, data: &[u8], chunked: bool) -> arbitrary::Result<()> {
+    let mut codec = Codec::new();
+    let mut buf = BytesMut::new();
+
+    let mut feed_and_drain = |buf: &mut BytesMut| -> bool {
+        // Pump until the codec stops yielding complete messages. Errors are
+        // expected; report them as `false` so the caller stops feeding, since a
+        // framed connection is torn down on the first decode error.
+        loop {
+            match codec.decode(buf) {
+                Ok(Some(_msg)) => continue,
+                Ok(None) => return true,
+                Err(_) => return false,
+            }
+        }
+    };
+
+    if chunked {
+        let mut rest = data;
+        while !rest.is_empty() {
+            let take = u.int_in_range(1usize..=rest.len())?;
+            buf.extend_from_slice(&rest[..take]);
+            rest = &rest[take..];
+            if !feed_and_drain(&mut buf) {
+                break;
+            }
+        }
+    } else {
+        buf.extend_from_slice(data);
+        feed_and_drain(&mut buf);
+    }
+    Ok(())
+}
+
+fn run(mut u: Unstructured) -> arbitrary::Result<()> {
+    // A quarter of the time, the raw bytes: keeps the header-framing and
+    // unknown-tag error paths covered.
+    if u.int_in_range(0u8..=3)? == 0 {
+        let chunked = u.int_in_range(0u8..=1)? == 0;
+        let rest = u.take_rest();
+        return pump(&mut Unstructured::new(rest), rest, chunked);
+    }
+    let mut out = Vec::new();
+    let frames = u.int_in_range(1usize..=5)?;
+    for _ in 0..frames {
+        push_frame(&mut u, &mut out)?;
+    }
+    // Half the time, drip the assembled stream into the decoder in chunks to
+    // exercise the partial-frame await-more-bytes logic mid-stream.
+    let chunked = u.int_in_range(0u8..=1)? == 0;
+    pump(&mut u, &out, chunked)
+}
+
+fuzz_target!(|data: &[u8]| {
+    let _ = run(Unstructured::new(data));
+});
diff --git a/src/pgwire/fuzz/prepare-corpus.sh b/src/pgwire/fuzz/prepare-corpus.sh
new file mode 100755
index 0000000000000..0c68bfc61aef8
--- /dev/null
+++ b/src/pgwire/fuzz/prepare-corpus.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+#
+# prepare-corpus.sh populates the `codec_decode` corpus with hand-crafted
+# valid pgwire frontend frames. libFuzzer mutates these into adjacent
+# malformed variants while keeping enough structure to reach the deeper
+# parsing paths.
+
+set -euo pipefail
+
+cd "$(dirname "$0")"
+
+corpus=corpus/codec_decode
+mkdir -p "$corpus"
+find "$corpus" -maxdepth 1 -name 'seed_*.bin' -delete
+
+python3 - "$corpus" <<'PY'
+import os, struct, sys
+corpus = sys.argv[1]
+
+def frame(tag: bytes, payload: bytes) -> bytes:
+    # Standard pgwire frame: 1-byte type tag + 4-byte BE length (incl
+    # itself) + payload.
+    return tag + struct.pack(">I", 4 + len(payload)) + payload
+
+def cstr(s: str) -> bytes:
+    return s.encode() + b"\x00"
+
+def startup() -> bytes:
+    # Startup messages have no tag, just length + version + key/value pairs.
+    payload = struct.pack(">I", 0x0003_0000)
+    payload += cstr("user") + cstr("mz")
+    payload += cstr("database") + cstr("materialize")
+    payload += cstr("application_name") + cstr("fuzz")
+    payload += b"\x00"
+    return struct.pack(">I", 4 + len(payload)) + payload
+
+seeds = {
+    "01_startup": startup(),
+    "02_ssl_request": struct.pack(">II", 8, 0x04D2_162F),
+    "03_cancel_request": struct.pack(">IIII", 16, 0x04D2_162E, 12345, 67890),
+    "04_query_select_1": frame(b"Q", cstr("SELECT 1")),
+    "05_query_empty": frame(b"Q", cstr("")),
+    "06_parse_named": frame(b"P", cstr("stmt1") + cstr("SELECT $1::int4") + struct.pack(">H", 0)),
+    "07_bind_unnamed": frame(b"B", cstr("") + cstr("") + struct.pack(">HHH", 0, 0, 0)),  # 0 format codes, 0 params, 0 result formats
+    "08_execute": frame(b"E", cstr("") + struct.pack(">I", 0)),
+    "09_sync": frame(b"S", b""),
+    "10_terminate": frame(b"X", b""),
+    "11_describe_portal": frame(b"D", b"P" + cstr("")),
+    "12_describe_statement": frame(b"D", b"S" + cstr("")),
+    "13_close_portal": frame(b"C", b"P" + cstr("")),
+    "14_flush": frame(b"H", b""),
+    "15_copy_data": frame(b"d", b"some-bytes"),
+    "16_copy_done": frame(b"c", b""),
+    "17_copy_fail": frame(b"f", cstr("client gave up")),
+    "18_password": frame(b"p", cstr("hunter2")),
+    "19_sasl_initial": frame(b"p", cstr("SCRAM-SHA-256") + struct.pack(">I", 5) + b"hello"),
+    "20_function_call": frame(b"F", struct.pack(">IHHH", 100, 0, 0, 0)),  # oid, 0 arg formats, 0 args, text result
+}
+
+for name, blob in seeds.items():
+    with open(os.path.join(corpus, f"seed_{name}.bin"), "wb") as f:
+        f.write(blob)
+PY
+
+echo "Seeded:"
+count=$(find "$corpus" -maxdepth 1 -name '*.bin' | wc -l)
+printf " %-40s %4d seeds\n" "$corpus" "$count"
diff --git a/src/pgwire/src/lib.rs b/src/pgwire/src/lib.rs
index 8cc24da9dcf46..cc34b764f5f4b 100644
--- a/src/pgwire/src/lib.rs
+++ b/src/pgwire/src/lib.rs
@@ -33,3 +33,12 @@ mod server;
 pub use metrics::MetricsConfig;
 pub use protocol::match_handshake;
 pub use server::{Config, Server};
+
+/// Internal types re-exported under `cfg(feature = "fuzzing")` so the fuzz
+/// crate can drive the frontend-message decoder directly. Not for
+/// production use.
+#[cfg(feature = "fuzzing")]
+pub mod fuzz_exports {
+    pub use crate::codec::Codec;
+    pub use mz_pgwire_common::FrontendMessage;
+}
diff --git a/src/postgres-util/fuzz/.gitignore b/src/postgres-util/fuzz/.gitignore
new file mode 100644
index 0000000000000..ab0eaa1a49031
--- /dev/null
+++ b/src/postgres-util/fuzz/.gitignore
@@ -0,0 +1,5 @@
+target/
+corpus/
+artifacts/
+coverage/
+Cargo.lock
diff --git a/src/postgres-util/fuzz/Cargo.toml b/src/postgres-util/fuzz/Cargo.toml
new file mode 100644
index 0000000000000..0be72968f1fc6
--- /dev/null
+++ b/src/postgres-util/fuzz/Cargo.toml
@@ -0,0 +1,29 @@
+# Fuzz crate for mz-postgres-util desc proto round-trip. `PostgresTableDesc`
+# describes external-database schemas, so a decoder bug here is reachable from
+# a compromised upstream Postgres or from on-disk catalog bytes.
+ +[package] +workspace = "../../../test/cargo-fuzz" +name = "mz-postgres-util-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +# `schemas` (a default feature) is what gates `desc` + its proptest `Arbitrary` +# impls. Make the dependency explicit so the structured arm always builds. +mz-postgres-util = { path = "..", features = ["schemas"] } +mz-proto = { path = "../../proto" } +proptest = "1" +prost = "0.14.3" + +[[bin]] +name = "postgres_table_desc_proto_roundtrip" +path = "fuzz_targets/postgres_table_desc_proto_roundtrip.rs" +test = false +doc = false +bench = false diff --git a/src/postgres-util/fuzz/fuzz_targets/postgres_table_desc_proto_roundtrip.rs b/src/postgres-util/fuzz/fuzz_targets/postgres_table_desc_proto_roundtrip.rs new file mode 100644 index 0000000000000..9021238101157 --- /dev/null +++ b/src/postgres-util/fuzz/fuzz_targets/postgres_table_desc_proto_roundtrip.rs @@ -0,0 +1,184 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `ProtoPostgresTableDesc` <-> `PostgresTableDesc` round-trip. +//! `PostgresTableDesc` describes external-database schemas, so a decoder bug +//! here is reachable from a compromised upstream Postgres or on-disk catalog +//! bytes. +//! +//! The first input byte selects the arm. The rest feeds it: +//! +//! * **Structured arm.** Drives `PostgresTableDesc`'s proptest `Arbitrary` +//! (behind mz-postgres-util's `schemas` feature) from the libFuzzer byte +//! stream to synthesize a *valid, fully-populated* value, then asserts the +//! `value -> proto -> value` chain is the identity AND that re-encoding the +//! 
proto is byte-idempotent. This is what actually reaches the deep shape:
+//!   several `PostgresColumnDesc`s with arbitrary `col_num`/`type_oid`/
+//!   `type_mod`/`nullable`, and a `BTreeSet` of several `PostgresKeyDesc`s,
+//!   each with a `Vec<u16>` of `cols`. Random proto bytes decode to a
+//!   near-empty desc, so the populated branches never get covered otherwise.
+//! * **Targeted arms.** Two hand-built protos pin the untrusted-bytes
+//!   invariants directly: `col_num` / `cols` values on either side of the
+//!   `u16` boundary (the `u32 -> u16` narrowing must return `Err`, not
+//!   panic), and a repeated `keys` field with duplicate / unsorted entries
+//!   (must collapse cleanly into the Rust `BTreeSet`).
+//! * **Raw-bytes arm.** Decodes arbitrary bytes straight into the proto then
+//!   `into_rust`, exercising the decoder against malformed/adversarial input,
+//!   then re-encodes the recovered value and checks the same round-trip.
+
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use mz_postgres_util::desc::{
+    PostgresTableDesc, ProtoPostgresColumnDesc, ProtoPostgresKeyDesc, ProtoPostgresTableDesc,
+};
+use mz_proto::ProtoType;
+use proptest::arbitrary::Arbitrary;
+use proptest::strategy::{Strategy, ValueTree};
+use proptest::test_runner::{Config, RngAlgorithm, TestRng, TestRunner};
+use prost::Message;
+
+/// Build a 32-byte proptest seed from `bytes` (zero-padded / truncated).
+fn seed_from(bytes: &[u8]) -> [u8; 32] {
+    let mut seed = [0u8; 32];
+    let n = bytes.len().min(32);
+    seed[..n].copy_from_slice(&bytes[..n]);
+    seed
+}
+
+/// Assert the `value -> proto -> value` chain is the identity and that
+/// re-encoding the proto is byte-idempotent.
+fn assert_roundtrip(orig: PostgresTableDesc) {
+    let proto = <ProtoPostgresTableDesc as ProtoType<PostgresTableDesc>>::from_rust(&orig);
+    let bytes = proto.encode_to_vec();
+    let proto2 = ProtoPostgresTableDesc::decode(bytes.as_slice())
+        .expect("re-encode of valid PostgresTableDesc must decode");
+    let round: PostgresTableDesc = proto2
+        .into_rust()
+        .expect("re-encoded PostgresTableDesc must convert back to Rust");
+    assert_eq!(orig, round, "PostgresTableDesc changed across proto roundtrip");
+
+    // Encoding the recovered value must reproduce the same wire bytes.
+    let bytes2 = <ProtoPostgresTableDesc as ProtoType<PostgresTableDesc>>::from_rust(&round)
+        .encode_to_vec();
+    assert_eq!(bytes, bytes2, "proto re-encode was not idempotent");
+}
+
+/// Decode adversarial proto bytes, convert to Rust, then round-trip.
+fn raw_roundtrip(data: &[u8]) {
+    let Ok(proto) = ProtoPostgresTableDesc::decode(data) else {
+        return;
+    };
+    // `into_rust` may legitimately reject (e.g. a `col_num`/`cols` value that
+    // overflows `u16`). It must do so via `Err`, never a panic.
+    let Ok(orig): Result<PostgresTableDesc, _> = proto.into_rust() else {
+        return;
+    };
+    assert_roundtrip(orig);
+}
+
+fuzz_target!(|data: &[u8]| {
+    let Some((&mode, rest)) = data.split_first() else {
+        return;
+    };
+
+    match mode % 4 {
+        // Structured arm: synthesize a valid, fully-populated desc.
+        0 => {
+            let mut runner = TestRunner::new_with_rng(
+                Config::default(),
+                TestRng::from_seed(RngAlgorithm::ChaCha, &seed_from(rest)),
+            );
+            if let Ok(tree) = PostgresTableDesc::arbitrary().new_tree(&mut runner) {
+                assert_roundtrip(tree.current());
+            }
+        }
+        // Targeted arm: hand-build a proto whose `col_num` / `cols` values
+        // straddle the u16 boundary, confirming the u32 -> u16 narrowing in
+        // `from_proto` returns `Err` (not a panic) for the out-of-range cases
+        // and succeeds for the in-range ones.
+        1 => {
+            // Use the first few bytes as little-endian u32 candidates so the
+            // fuzzer can search both sides of the 65535 boundary.
+            let take_u32 = |i: usize| -> u32 {
+                let mut buf = [0u8; 4];
+                let n = rest.len().saturating_sub(i * 4).min(4);
+                if n > 0 {
+                    buf[..n].copy_from_slice(&rest[i * 4..i * 4 + n]);
+                }
+                u32::from_le_bytes(buf)
+            };
+            let col_num = take_u32(0);
+            let key_col = take_u32(1);
+
+            let proto = ProtoPostgresTableDesc {
+                name: "t".into(),
+                namespace: "n".into(),
+                oid: 1,
+                columns: vec![ProtoPostgresColumnDesc {
+                    name: "c".into(),
+                    type_oid: 23,
+                    type_mod: -1,
+                    nullable: true,
+                    col_num: Some(col_num),
+                }],
+                keys: vec![ProtoPostgresKeyDesc {
+                    oid: 2,
+                    name: "k".into(),
+                    cols: vec![key_col],
+                    is_primary: true,
+                    nulls_not_distinct: false,
+                }],
+            };
+            let bytes = proto.encode_to_vec();
+            let decoded = ProtoPostgresTableDesc::decode(bytes.as_slice())
+                .expect("hand-built proto must decode");
+            // Whether the narrowing fits, the only requirement is no panic. If
+            // it converts, the value must round-trip.
+            let converted: Result<PostgresTableDesc, _> = decoded.into_rust();
+            if let Ok(orig) = converted {
+                assert_roundtrip(orig);
+            }
+        }
+        // Targeted arm: a wire proto with duplicate and unsorted `keys`. The
+        // repeated field maps to a Rust `BTreeSet`, which dedups + sorts. This
+        // must collapse cleanly with no ordering/dup assertion firing.
+        2 => {
+            let mk = |oid: u32, col: u16| ProtoPostgresKeyDesc {
+                oid,
+                name: "k".into(),
+                cols: vec![u32::from(col)],
+                is_primary: false,
+                nulls_not_distinct: false,
+            };
+            // Intentionally out of order with a duplicate entry.
+ let proto = ProtoPostgresTableDesc { + name: "t".into(), + namespace: "n".into(), + oid: 7, + columns: vec![ProtoPostgresColumnDesc { + name: "c".into(), + type_oid: 23, + type_mod: -1, + nullable: false, + col_num: Some(1), + }], + keys: vec![mk(3, 2), mk(1, 9), mk(3, 2), mk(2, 0)], + }; + let bytes = proto.encode_to_vec(); + let decoded = ProtoPostgresTableDesc::decode(bytes.as_slice()) + .expect("hand-built proto must decode"); + let orig: PostgresTableDesc = + decoded.into_rust().expect("duplicate/unsorted keys must convert"); + assert_roundtrip(orig); + } + // Raw-bytes arm: decode adversarial proto bytes, then round-trip. + _ => raw_roundtrip(rest), + } +}); diff --git a/src/repr/fuzz/.gitignore b/src/repr/fuzz/.gitignore new file mode 100644 index 0000000000000..ab0eaa1a49031 --- /dev/null +++ b/src/repr/fuzz/.gitignore @@ -0,0 +1,5 @@ +target/ +corpus/ +artifacts/ +coverage/ +Cargo.lock diff --git a/src/repr/fuzz/Cargo.toml b/src/repr/fuzz/Cargo.toml new file mode 100644 index 0000000000000..b3b460015a154 --- /dev/null +++ b/src/repr/fuzz/Cargo.toml @@ -0,0 +1,203 @@ +# Fuzz crate for mz-repr proto round-trip properties. `Row` is the core +# value representation in Materialize, serialized via prost as `ProtoRow`. +# +# Excluded from the main workspace because libFuzzer requires nightly Rust. 
+# Run via the repo-wide runner: `bin/ci-builder run nightly ci/test/cargo-fuzz.sh`, +# or locally: +# cd src/repr/fuzz +# cargo +nightly fuzz run row_proto_roundtrip -- -max_total_time=60 + +[package] +workspace = "../../../test/cargo-fuzz" +name = "mz-repr-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +mz-persist-types = { path = "../../persist-types" } +mz-proto = { path = "../../proto" } +mz-repr = { path = "..", features = ["proptest"] } +chrono = { version = "0.4", default-features = false } +dec = "0.4.9" +prost = "0.14.3" +proptest = "1" + +[[bin]] +name = "row_proto_roundtrip" +path = "fuzz_targets/row_proto_roundtrip.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "row_codec_roundtrip" +path = "fuzz_targets/row_codec_roundtrip.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "scalar_type_proto_roundtrip" +path = "fuzz_targets/scalar_type_proto_roundtrip.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "column_type_proto_roundtrip" +path = "fuzz_targets/column_type_proto_roundtrip.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "relation_desc_proto_roundtrip" +path = "fuzz_targets/relation_desc_proto_roundtrip.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "interval_proto_roundtrip" +path = "fuzz_targets/interval_proto_roundtrip.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "mz_acl_item_proto_roundtrip" +path = "fuzz_targets/mz_acl_item_proto_roundtrip.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "acl_item_proto_roundtrip" +path = "fuzz_targets/acl_item_proto_roundtrip.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "strconv_parse_array" +path = "fuzz_targets/strconv_parse_array.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "strconv_parse_list" +path = "fuzz_targets/strconv_parse_list.rs" +test = false 
+doc = false +bench = false + +[[bin]] +name = "strconv_parse_map" +path = "fuzz_targets/strconv_parse_map.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "strconv_parse_range" +path = "fuzz_targets/strconv_parse_range.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "strconv_parse_numeric" +path = "fuzz_targets/strconv_parse_numeric.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "strconv_parse_interval" +path = "fuzz_targets/strconv_parse_interval.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "jsonb_from_slice" +path = "fuzz_targets/jsonb_from_slice.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "strconv_parse_timestamp" +path = "fuzz_targets/strconv_parse_timestamp.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "strconv_parse_timestamptz" +path = "fuzz_targets/strconv_parse_timestamptz.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "strconv_parse_date" +path = "fuzz_targets/strconv_parse_date.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "strconv_parse_time" +path = "fuzz_targets/strconv_parse_time.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "interval_arith" +path = "fuzz_targets/interval_arith.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "numeric_arith" +path = "fuzz_targets/numeric_arith.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "range_ops" +path = "fuzz_targets/range_ops.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "row_arrow_roundtrip" +path = "fuzz_targets/row_arrow_roundtrip.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "strconv_parse_bytes" +path = "fuzz_targets/strconv_parse_bytes.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "strconv_parse_uuid" +path = "fuzz_targets/strconv_parse_uuid.rs" +test = false +doc = false +bench = false diff --git 
a/src/repr/fuzz/fuzz_targets/acl_item_proto_roundtrip.rs b/src/repr/fuzz/fuzz_targets/acl_item_proto_roundtrip.rs
new file mode 100644
index 0000000000000..1629f7016313b
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/acl_item_proto_roundtrip.rs
@@ -0,0 +1,77 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `AclItem` proto round-trips losslessly.
+//! Distinct from `MzAclItem`: `AclItem` is the PostgreSQL-style ACL entry.
+//!
+//! Two arms (the first byte selects):
+//! - Arbitrary arm: drive `AclItem`'s proptest `Arbitrary` strategy from the
+//!   fuzzer bytes to build a *valid* entry (grantee/grantor Oids + an arbitrary
+//!   `AclMode` bit flag) and assert `from_proto(into_proto(v)) == v`.
+//! - Raw-bytes arm: decode arbitrary bytes as `ProtoAclItem`, into Rust, and
+//!   re-encode, keeping coverage of the bare wire decoder against hostile
+//!   input.
+
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use mz_proto::{ProtoType, RustType};
+use mz_repr::adt::mz_acl_item::{AclItem, ProtoAclItem};
+use proptest::strategy::{Strategy, ValueTree};
+use proptest::test_runner::{Config, RngAlgorithm, TestRng, TestRunner};
+use prost::Message;
+
+fn arbitrary_arm(seed: &[u8]) {
+    let mut buf = [0u8; 32];
+    for (dst, src) in buf.iter_mut().zip(seed.iter()) {
+        *dst = *src;
+    }
+    let rng = TestRng::from_seed(RngAlgorithm::ChaCha, &buf);
+    let mut runner = TestRunner::new_with_rng(Config::default(), rng);
+    let value =
+        match <AclItem as proptest::arbitrary::Arbitrary>::arbitrary().new_tree(&mut runner) {
+            Ok(tree) => tree.current(),
+            Err(_) => return,
+        };
+
+    let proto = value.into_proto();
+    let back = AclItem::from_proto(proto).expect("valid AclItem must round-trip");
+    assert_eq!(value, back, "AclItem changed across proto roundtrip");
+}
+
+fn raw_arm(data: &[u8]) {
+    let Ok(proto) = ProtoAclItem::decode(data) else {
+        return;
+    };
+    let orig: AclItem = match proto.into_rust() {
+        Ok(v) => v,
+        Err(_) => return,
+    };
+
+    let proto2 = <ProtoAclItem as ProtoType<AclItem>>::from_rust(&orig);
+    let bytes2 = proto2.encode_to_vec();
+    let proto3 = ProtoAclItem::decode(bytes2.as_slice())
+        .expect("re-encode of valid AclItem must decode");
+    let round: AclItem = proto3
+        .into_rust()
+        .expect("re-encoded AclItem must convert back to Rust");
+
+    assert_eq!(orig, round, "AclItem changed across proto roundtrip");
+}
+
+fuzz_target!(|data: &[u8]| {
+    let Some((&mode, rest)) = data.split_first() else {
+        return;
+    };
+    if mode & 1 == 0 {
+        arbitrary_arm(rest);
+    } else {
+        raw_arm(rest);
+    }
+});
diff --git a/src/repr/fuzz/fuzz_targets/column_type_proto_roundtrip.rs b/src/repr/fuzz/fuzz_targets/column_type_proto_roundtrip.rs
new file mode 100644
index 0000000000000..82781158f9fe3
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/column_type_proto_roundtrip.rs
@@ -0,0 +1,79 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `SqlColumnType` proto round-trips losslessly.
+//!
+//! Two arms (the first byte selects):
+//! - Arbitrary arm: drive `SqlColumnType`'s proptest `Arbitrary` strategy from
+//!   the fuzzer bytes to build a *valid* column type pairing a deeply-nested
+//!   `SqlScalarType` with a nullable flag, and assert
+//!   `from_proto(into_proto(v)) == v`. Random proto bytes leave the inner
+//!   scalar type near-empty. This arm reaches the recursive scalar variants.
+//! - Raw-bytes arm: decode arbitrary bytes as `ProtoColumnType`, into Rust, and
+//!   re-encode, keeping coverage of the bare wire decoder against hostile
+//!   input.
+
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use mz_proto::{ProtoType, RustType};
+use mz_repr::{ProtoColumnType, SqlColumnType};
+use proptest::strategy::{Strategy, ValueTree};
+use proptest::test_runner::{Config, RngAlgorithm, TestRng, TestRunner};
+use prost::Message;
+
+fn arbitrary_arm(seed: &[u8]) {
+    let mut buf = [0u8; 32];
+    for (dst, src) in buf.iter_mut().zip(seed.iter()) {
+        *dst = *src;
+    }
+    let rng = TestRng::from_seed(RngAlgorithm::ChaCha, &buf);
+    let mut runner = TestRunner::new_with_rng(Config::default(), rng);
+    let value = match <SqlColumnType as proptest::arbitrary::Arbitrary>::arbitrary()
+        .new_tree(&mut runner)
+    {
+        Ok(tree) => tree.current(),
+        Err(_) => return,
+    };
+
+    let proto = value.into_proto();
+    let back = SqlColumnType::from_proto(proto).expect("valid SqlColumnType must round-trip");
+    assert_eq!(value, back, "SqlColumnType changed across proto roundtrip");
+}
+
+fn raw_arm(data: &[u8]) {
+    let Ok(proto) = ProtoColumnType::decode(data) else {
+        return;
+    };
+    let orig: SqlColumnType = match proto.into_rust() {
+        Ok(v) => v,
+        Err(_) => return,
+    };
+
+    let proto2 = <ProtoColumnType as ProtoType<SqlColumnType>>::from_rust(&orig);
+    let bytes2 = proto2.encode_to_vec();
+    let proto3 = ProtoColumnType::decode(bytes2.as_slice())
+        .expect("re-encode of valid SqlColumnType must decode");
+    let round: SqlColumnType = proto3
+        .into_rust()
+        .expect("re-encoded SqlColumnType must convert back to Rust");
+
+    assert_eq!(orig, round, "SqlColumnType changed across proto roundtrip");
+}
+
+fuzz_target!(|data: &[u8]| {
+    let Some((&mode, rest)) = data.split_first() else {
+        return;
+    };
+    if mode & 1 == 0 {
+        arbitrary_arm(rest);
+    } else {
+        raw_arm(rest);
+    }
+});
diff --git a/src/repr/fuzz/fuzz_targets/interval_arith.rs b/src/repr/fuzz/fuzz_targets/interval_arith.rs
new file mode 100644
index 0000000000000..4d4e3c44084fd
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/interval_arith.rs
@@ -0,0 +1,163 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `Interval` arithmetic. Intervals are built from user input and
+//! combined with checked arithmetic that must never panic (overflow returns
+//! `None`/`Err`, not a crash) and must respect basic algebra.
+//!
+//! Oracles (beyond no-panic on every op):
+//! * add is commutative;
+//! * add is associative (`(a+b)+c == a+(b+c)`) whenever both groupings succeed;
+//! * additive inverse: `a + (-a)` is the zero interval (when `checked_neg` and
+//!   the add both succeed);
+//! * the sketchy fractional-carry `as`-cast path in `checked_mul`/`checked_op`
+//!   is cross-checked against an exact integer reference: for a small whole
+//!   factor `k`, `checked_mul(s, k)` must equal `k` repeated `checked_add`s,
+//!   and `checked_mul(s, 1.0) == s`. Because that path routes the i64 `micros`
+//!   through `f64`, the identity only holds while the fields and their products
+//!   stay inside `f64`'s exact-integer range (2^53). The cross-checked operand
+//!   `s` is masked into a small range so the comparison is sound (an unbounded
+//!   operand legitimately loses precision and is NOT a bug);
+//! * `justify_days` / `justify_hours` / `justify_interval` preserve the
+//!   interval's total microseconds (they only redistribute fields using the
+//!   fixed 30-day-month / 24-hour-day ratios that `as_microseconds` also uses).
+
+#![no_main]
+
+use libfuzzer_sys::arbitrary::{self, Unstructured};
+use libfuzzer_sys::fuzz_target;
+use mz_repr::adt::interval::Interval;
+
+/// Negate every field with checked arithmetic, mirroring `Interval`'s
+/// `CheckedNeg` impl without pulling in the `num-traits` trait. Returns `None`
+/// if any field is `MIN` (its negation overflows).
+fn checked_neg(a: &Interval) -> Option<Interval> {
+    Some(Interval::new(
+        a.months.checked_neg()?,
+        a.days.checked_neg()?,
+        a.micros.checked_neg()?,
+    ))
+}
+
+/// Add `a` to itself `k` times via `checked_add`, the exact integer reference
+/// for `checked_mul(a, k as f64)`. Returns `None` on any intermediate overflow.
+fn repeated_add(a: &Interval, k: u32) -> Option<Interval> {
+    let mut acc = Interval::new(0, 0, 0);
+    for _ in 0..k {
+        acc = acc.checked_add(a)?;
+    }
+    Some(acc)
+}
+
+/// Bounds chosen so that, for the small factor `k <= 16`, every field AND its
+/// product `field * k` stay (a) exactly representable in `f64` (< 2^53) and
+/// (b) inside the destination integer's range, so neither `checked_mul`'s
+/// `as`-cast path nor `repeated_add`'s integer path overflows. That makes the
+/// two computations a clean, exact reference for each other.
+const MONTHS_DAYS_BOUND: i32 = 1 << 26; // *16 = 2^30 < i32::MAX, f64-exact
+const MICROS_BOUND: i64 = 1 << 49; // *16 = 2^53, f64-exact, < i64::MAX
+
+fn check(a: Interval, b: Interval, c: Interval, s: Interval, factor: f64, small_k: u8) {
+    // --- Addition is commutative, and overflow is symmetric. --------------
+    assert_eq!(
+        a.checked_add(&b),
+        b.checked_add(&a),
+        "interval checked_add is not commutative"
+    );
+
+    // --- Addition is associative when both groupings succeed. -------------
+    let left = a.checked_add(&b).and_then(|ab| ab.checked_add(&c));
+    let right = b.checked_add(&c).and_then(|bc| a.checked_add(&bc));
+    // Only compare when neither side overflowed. If one overflows the other may
+    // legitimately not (intermediate sums differ in magnitude).
+    if let (Some(left), Some(right)) = (left, right) {
+        assert_eq!(left, right, "interval checked_add is not associative");
+    }
+
+    // --- Additive inverse: a + (-a) == 0. ---------------------------------
+    if let Some(neg_a) = checked_neg(&a) {
+        if let Some(sum) = a.checked_add(&neg_a) {
+            assert_eq!(
+                sum,
+                Interval::new(0, 0, 0),
+                "a + (-a) must be the zero interval"
+            );
+        }
+    }
+
+    // --- Multiply through the `as`-cast path, cross-checked exactly. ------
+    // `s` is masked so every field and its product with `k` fits exactly in
+    // `f64`. Otherwise the i64->f64->i64 round-trip in `checked_op` is lossy and
+    // the identities below would not hold (that loss is expected, not a bug).
+    let s = Interval::new(
+        s.months % MONTHS_DAYS_BOUND,
+        s.days % MONTHS_DAYS_BOUND,
+        s.micros % MICROS_BOUND,
+    );
+
+    // Multiply-by-one is the identity on the bounded operand.
+    if let Some(prod) = s.checked_mul(1.0) {
+        assert_eq!(prod, s, "checked_mul(s, 1.0) must equal s");
+    }
+
+    // checked_mul(s, k) == k repeated checked_adds, for small whole k. `k` is
+    // small so repeated_add stays cheap and the products stay f64-exact.
+    let k = u32::from(small_k % 17); // 0..=16
+    let by_mul = s.checked_mul(f64::from(k));
+    let by_add = repeated_add(&s, k);
+    // Both must agree on success/overflow and, on success, on the value.
+    assert_eq!(
+        by_mul, by_add,
+        "checked_mul(s, {k}) disagrees with {k} repeated checked_adds"
+    );
+
+    // --- justify_* preserve total microseconds. ---------------------------
+    let total = a.as_microseconds();
+    if let Ok(j) = a.justify_days() {
+        assert_eq!(
+            j.as_microseconds(),
+            total,
+            "justify_days changed total microseconds"
+        );
+    }
+    if let Ok(j) = a.justify_hours() {
+        assert_eq!(
+            j.as_microseconds(),
+            total,
+            "justify_hours changed total microseconds"
+        );
+    }
+    if let Ok(j) = a.justify_interval() {
+        assert_eq!(
+            j.as_microseconds(),
+            total,
+            "justify_interval changed total microseconds"
+        );
+    }
+
+    // --- Remaining ops just must not panic on any input, incl. NaN/inf. ---
+    let _ = a.checked_mul(factor);
+    let _ = a.checked_div(factor);
+    let _ = b.checked_mul(factor);
+}
+
+fn run(mut u: Unstructured) -> arbitrary::Result<()> {
+    let a = Interval::new(u.arbitrary()?, u.arbitrary()?, u.arbitrary()?);
+    let b = Interval::new(u.arbitrary()?, u.arbitrary()?, u.arbitrary()?);
+    let c = Interval::new(u.arbitrary()?, u.arbitrary()?, u.arbitrary()?);
+    let s = Interval::new(u.arbitrary()?, u.arbitrary()?, u.arbitrary()?);
+    let factor: f64 = u.arbitrary()?;
+    let small_k: u8 = u.arbitrary()?;
+    check(a, b, c, s, factor, small_k);
+    Ok(())
+}
+
+fuzz_target!(|data: &[u8]| {
+    let _ = run(Unstructured::new(data));
+});
diff --git a/src/repr/fuzz/fuzz_targets/interval_proto_roundtrip.rs b/src/repr/fuzz/fuzz_targets/interval_proto_roundtrip.rs
new file mode 100644
index 0000000000000..8e01d256cac7f
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/interval_proto_roundtrip.rs
@@ -0,0 +1,76 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `Interval` proto round-trips losslessly.
+//!
+//! Two arms (the first byte selects):
+//! - Arbitrary arm: drive `Interval`'s proptest `Arbitrary` strategy from the
+//!   fuzzer bytes to build a *valid* interval (months/days/micros across the
+//!   full range) and assert `from_proto(into_proto(v)) == v`.
+//! - Raw-bytes arm: decode arbitrary bytes as `ProtoInterval`, into Rust, and
+//!   re-encode, keeping coverage of the bare wire decoder against hostile
+//!   input.
+
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use mz_proto::{ProtoType, RustType};
+use mz_repr::adt::interval::{Interval, ProtoInterval};
+use proptest::strategy::{Strategy, ValueTree};
+use proptest::test_runner::{Config, RngAlgorithm, TestRng, TestRunner};
+use prost::Message;
+
+fn arbitrary_arm(seed: &[u8]) {
+    let mut buf = [0u8; 32];
+    for (dst, src) in buf.iter_mut().zip(seed.iter()) {
+        *dst = *src;
+    }
+    let rng = TestRng::from_seed(RngAlgorithm::ChaCha, &buf);
+    let mut runner = TestRunner::new_with_rng(Config::default(), rng);
+    let value =
+        match <Interval as proptest::arbitrary::Arbitrary>::arbitrary().new_tree(&mut runner) {
+            Ok(tree) => tree.current(),
+            Err(_) => return,
+        };
+
+    let proto = value.into_proto();
+    let back = Interval::from_proto(proto).expect("valid Interval must round-trip");
+    assert_eq!(value, back, "Interval changed across proto roundtrip");
+}
+
+fn raw_arm(data: &[u8]) {
+    let Ok(proto) = ProtoInterval::decode(data) else {
+        return;
+    };
+    let orig: Interval = match proto.into_rust() {
+        Ok(v) => v,
+        Err(_) => return,
+    };
+
+    let proto2 = <ProtoInterval as ProtoType<Interval>>::from_rust(&orig);
+    let bytes2 = proto2.encode_to_vec();
+    let proto3 = ProtoInterval::decode(bytes2.as_slice())
+        .expect("re-encode of valid Interval must decode");
+    let round: Interval = proto3
+        .into_rust()
+        .expect("re-encoded Interval must convert back to Rust");
+
+    assert_eq!(orig, round, "Interval changed across proto roundtrip");
+}
+
+fuzz_target!(|data: &[u8]| {
+    let Some((&mode, rest)) = data.split_first() else {
+        return;
+    };
+    if mode & 1 == 0 {
+        arbitrary_arm(rest);
+    } else {
+        raw_arm(rest);
+    }
+});
diff --git a/src/repr/fuzz/fuzz_targets/jsonb_from_slice.rs b/src/repr/fuzz/fuzz_targets/jsonb_from_slice.rs
new file mode 100644
index 0000000000000..c378d019cac24
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/jsonb_from_slice.rs
@@ -0,0 +1,253 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `Jsonb::from_slice` decodes untrusted JSON bytes (Kafka/webhook
+//! source bodies) into Materialize's JSONB `Row` representation, doing recursive
+//! object/array packing with key dedup and numeric coercion. Beyond not
+//! panicking, its `Display` rendering must re-parse to the same value.
+//!
+//! Random bytes almost never form valid JSON, so byte mutation barely reaches
+//! past the first token. The recursive packing, numeric coercion, and key dedup
+//! (and the Display round-trip oracle, which only runs on a value that parsed)
+//! stay shallow. So we consume the byte stream as grammar choices and emit valid
+//! JSON, biased toward the bug-prone paths: numbers that stress the
+//! arbitrary-precision coercion (huge exponents, long digit runs, `-0`), objects
+//! with duplicate keys (the dedup path), nested arrays/objects, and strings with
+//! escapes. A quarter of inputs are still the raw bytes, so the parser's reject
+//! paths and non-UTF-8 handling stay covered.
+//!
+//! Two extra shapes probe edges shallow generation misses: a deep single-spine
+//! nesting (`[[[…]]]` / `{"a":{"a":…}}`) that walks the recursive collector up
+//! toward serde_json's nesting limit (where it must `Err`, not overflow the
+//! stack), and an object whose key is literally `$serde_json::private::Number`,
+//! serde_json's private arbitrary-precision-number token. Such an object is
+//! reinterpreted as a number on the way back in, so we don't run the re-parse
+//! oracle on it. We only assert `from_slice` doesn't panic and that `Display`
+//! renders fully without panicking.
+
+#![no_main]
+
+use libfuzzer_sys::arbitrary::{self, Unstructured};
+use libfuzzer_sys::fuzz_target;
+use mz_repr::adt::jsonb::Jsonb;
+
+/// Valid JSON numbers chosen to stress the arbitrary-precision numeric coercion:
+/// big integers, tiny/huge exponents, negative zero, long fractions.
+const NUMBERS: &[&str] = &[
+    "0",
+    "-0",
+    "1",
+    "-1",
+    "42",
+    "3.14",
+    "-2.5",
+    "1e10",
+    "1e-10",
+    "1e308",
+    "1e1000",
+    "-1e-1000",
+    "1.7976931348623157e308",
+    "123456789012345678901234567890",
+    "0.00000000000000000001",
+    "9999999999999999999999999999999999999999",
+];
+
+/// Object keys as complete, quoted JSON string literals: a duplicate-prone short
+/// set (so dedup fires), one that needs escaping, and the empty key.
+const KEYS: &[&str] = &["\"a\"", "\"b\"", "\"a\"", "\"k\"", "\"q\\\"x\"", "\"\""];
+
+fn push_string(u: &mut Unstructured, out: &mut String) -> arbitrary::Result<()> {
+    out.push('"');
+    let n = u.int_in_range(0usize..=5)?;
+    for _ in 0..n {
+        match u.int_in_range(0u8..=6)? {
+            0 => out.push_str("\\\""),
+            1 => out.push_str("\\\\"),
+            2 => out.push_str("\\n"),
+            3 => out.push_str("\\u0041"),
+            4 => out.push_str("\\uD83D\\uDE00"), // surrogate pair
+            _ => out.push(*u.choose(&['a', 'z', '0', ' ', '!'])?),
+        }
+    }
+    out.push('"');
+    Ok(())
+}
+
+fn gen_json(u: &mut Unstructured, depth: u32, out: &mut String) -> arbitrary::Result<()> {
+    let leaf = depth == 0 || u.is_empty();
+    let choice = if leaf {
+        u.int_in_range(0u8..=4)?
+    } else {
+        u.int_in_range(0u8..=6)?
+    };
+    match choice {
+        0 => out.push_str("null"),
+        1 => out.push_str(if u.int_in_range(0u8..=1)? == 0 {
+            "true"
+        } else {
+            "false"
+        }),
+        2 => out.push_str(u.choose(NUMBERS)?),
+        3 => push_string(u, out)?,
+        4 => {
+            // A generated number (valid JSON: leading 1-9, optional fraction).
+            if u.int_in_range(0u8..=1)? == 0 {
+                out.push('-');
+            }
+            out.push(*u.choose(&['1', '2', '3', '4', '5', '6', '7', '8', '9'])?);
+            for _ in 0..u.int_in_range(0usize..=12)? {
+                out.push(*u.choose(&['0', '1', '5', '9'])?);
+            }
+            if u.int_in_range(0u8..=1)? == 0 {
+                out.push('.');
+                for _ in 0..u.int_in_range(1usize..=4)? {
+                    out.push(*u.choose(&['0', '1', '9'])?);
+                }
+            }
+        }
+        5 => {
+            out.push('[');
+            let n = u.int_in_range(0usize..=4)?;
+            for i in 0..n {
+                if i > 0 {
+                    out.push(',');
+                }
+                gen_json(u, depth - 1, out)?;
+            }
+            out.push(']');
+        }
+        _ => {
+            out.push('{');
+            let n = u.int_in_range(0usize..=4)?;
+            for i in 0..n {
+                if i > 0 {
+                    out.push(',');
+                }
+                out.push_str(u.choose(KEYS)?); // may repeat -> dedup path
+                out.push(':');
+                gen_json(u, depth - 1, out)?;
+            }
+            out.push('}');
+        }
+    }
+    Ok(())
+}
+
+/// Emit a deep single-spine nesting (`[[[…leaf…]]]` or `{"a":{"a":…leaf…}}`) to
+/// walk the recursive collector down toward serde_json's nesting limit. Beyond
+/// the limit `from_slice` must return an `Err` (handled by `check`), never
+/// overflow the stack.
+fn gen_deep_spine(u: &mut Unstructured, depth: u32, out: &mut String) -> arbitrary::Result<()> {
+    let use_obj = u.int_in_range(0u8..=1)? == 0;
+    for _ in 0..depth {
+        if use_obj {
+            out.push_str("{\"a\":");
+        } else {
+            out.push('[');
+        }
+    }
+    gen_json(u, 0, out)?;
+    for _ in 0..depth {
+        out.push(if use_obj { '}' } else { ']' });
+    }
+    Ok(())
+}
+
+/// Render the object `{"$serde_json::private::Number": <value>}`. serde_json
+/// (built with `arbitrary_precision`) treats this exact key as its private
+/// number token, so the object is reinterpreted as a number rather than packed
+/// as an object. That asymmetry defeats the strict re-parse oracle but must
+/// still not panic in `from_slice`/`Display`.
+fn gen_private_number(u: &mut Unstructured, out: &mut String) -> arbitrary::Result<()> {
+    out.push_str("{\"$serde_json::private::Number\":");
+    // The value serde_json expects here is a string holding the digits, but feed
+    // a variety (string number, junk string, real number, nested) to probe how
+    // the number sniffer copes.
+    match u.int_in_range(0u8..=3)? {
+        0 => {
+            out.push('"');
+            out.push_str(u.choose(NUMBERS)?);
+            out.push('"');
+        }
+        1 => out.push_str("\"not-a-number\""),
+        2 => out.push_str(u.choose(NUMBERS)?),
+        _ => gen_json(u, 1, out)?,
+    }
+    out.push('}');
+    Ok(())
+}
+
+/// Parse, and only require that `Display` renders fully without panicking. Used
+/// for shapes whose round trip is inherently lossy (the private number token).
+fn check_no_panic(data: &[u8]) {
+    if let Ok(j) = Jsonb::from_slice(data) {
+        // Force the full Display rendering, which must not panic.
+        let _ = j.to_string();
+    }
+}
+
+fn check(data: &[u8]) {
+    let Ok(j) = Jsonb::from_slice(data) else {
+        return;
+    };
+    let formatted = j.to_string();
+    // serde_json (which mz-repr builds with `arbitrary_precision`) represents
+    // every JSON number internally as a single-key map
+    // `{"$serde_json::private::Number": "<digits>"}`. A JSON *object* that uses
+    // that exact key is therefore indistinguishable from a number on the way
+    // back in: JSONB sorts an object's keys for display, this `$`-prefixed token
+    // sorts ahead of ordinary keys, and the value is then reinterpreted as a
+    // (possibly invalid) number. That asymmetry is an inherent serde_json
+    // limitation, not a JSONB round-trip bug, and `from_slice` itself never
+    // panics on it. So skip the re-parse check whenever the rendering leans on
+    // serde_json's private number token.
+    const SERDE_JSON_NUMBER_TOKEN: &str = "$serde_json::private::Number";
+    if formatted.contains(SERDE_JSON_NUMBER_TOKEN) {
+        return;
+    }
+    let reparsed: Jsonb = formatted.parse().expect("jsonb Display must re-parse");
+    assert_eq!(j, reparsed, "jsonb changed across parse/format round trip");
+}
+
+fn run(mut u: Unstructured) -> arbitrary::Result<()> {
+    // A quarter of the time, the raw bytes: keeps the parser's reject paths and
+    // non-UTF-8 handling covered.
+    if u.int_in_range(0u8..=3)? == 0 {
+        check(u.take_rest());
+        return Ok(());
+    }
+    match u.int_in_range(0u8..=9)? {
+        // A deep single-spine nesting that probes the recursion limit. Depth is
+        // chosen to straddle serde_json's default (~128): well within, right at,
+        // and past it (where it must `Err`, not overflow).
+        0 => {
+            let depth = u.int_in_range(1u32..=200)?;
+            let mut s = String::new();
+            gen_deep_spine(&mut u, depth, &mut s)?;
+            check(s.as_bytes());
+        }
+        // The private-number-token object: assert no panic + total Display only.
+        1 => {
+            let mut s = String::new();
+            gen_private_number(&mut u, &mut s)?;
+            check_no_panic(s.as_bytes());
+        }
+        // The common case: a wide, moderately deep random JSON value.
+        _ => {
+            let mut s = String::new();
+            gen_json(&mut u, 6, &mut s)?;
+            check(s.as_bytes());
+        }
+    }
+    Ok(())
+}
+
+fuzz_target!(|data: &[u8]| {
+    let _ = run(Unstructured::new(data));
+});
diff --git a/src/repr/fuzz/fuzz_targets/mz_acl_item_proto_roundtrip.rs b/src/repr/fuzz/fuzz_targets/mz_acl_item_proto_roundtrip.rs
new file mode 100644
index 0000000000000..4e82c4225d77d
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/mz_acl_item_proto_roundtrip.rs
@@ -0,0 +1,80 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `MzAclItem` proto round-trips losslessly.
+//! ACL items are access-control data. A decoder bug here is a security
+//! concern (a malformed proto could decode into a value the rest of the
+//! catalog doesn't expect).
+//!
+//! Two arms (the first byte selects):
+//! - Arbitrary arm: drive `MzAclItem`'s proptest `Arbitrary` strategy from the
+//!   fuzzer bytes to build a *valid* entry (grantee/grantor `RoleId`s across
+//!   all variants + an arbitrary `AclMode` bit flag) and assert
+//!   `from_proto(into_proto(v)) == v`.
+//! - Raw-bytes arm: decode arbitrary bytes as `ProtoMzAclItem`, into Rust, and
+//!   re-encode, keeping coverage of the bare wire decoder against hostile
+//!   input.
+
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use mz_proto::{ProtoType, RustType};
+use mz_repr::adt::mz_acl_item::{MzAclItem, ProtoMzAclItem};
+use proptest::strategy::{Strategy, ValueTree};
+use proptest::test_runner::{Config, RngAlgorithm, TestRng, TestRunner};
+use prost::Message;
+
+fn arbitrary_arm(seed: &[u8]) {
+    let mut buf = [0u8; 32];
+    for (dst, src) in buf.iter_mut().zip(seed.iter()) {
+        *dst = *src;
+    }
+    let rng = TestRng::from_seed(RngAlgorithm::ChaCha, &buf);
+    let mut runner = TestRunner::new_with_rng(Config::default(), rng);
+    let value =
+        match <MzAclItem as proptest::arbitrary::Arbitrary>::arbitrary().new_tree(&mut runner) {
+            Ok(tree) => tree.current(),
+            Err(_) => return,
+        };
+
+    let proto = value.into_proto();
+    let back = MzAclItem::from_proto(proto).expect("valid MzAclItem must round-trip");
+    assert_eq!(value, back, "MzAclItem changed across proto roundtrip");
+}
+
+fn raw_arm(data: &[u8]) {
+    let Ok(proto) = ProtoMzAclItem::decode(data) else {
+        return;
+    };
+    let orig: MzAclItem = match proto.into_rust() {
+        Ok(v) => v,
+        Err(_) => return,
+    };
+
+    let proto2 = <ProtoMzAclItem as ProtoType<MzAclItem>>::from_rust(&orig);
+    let bytes2 = proto2.encode_to_vec();
+    let proto3 = ProtoMzAclItem::decode(bytes2.as_slice())
+        .expect("re-encode of valid MzAclItem must decode");
+    let round: MzAclItem = proto3
+        .into_rust()
+        .expect("re-encoded MzAclItem must convert back to Rust");
+
+    assert_eq!(orig, round, "MzAclItem changed across proto roundtrip");
+}
+
+fuzz_target!(|data: &[u8]| {
+    let Some((&mode, rest)) = data.split_first() else {
+        return;
+    };
+    if mode & 1 == 0 {
+        arbitrary_arm(rest);
+    } else {
+        raw_arm(rest);
+    }
+});
diff --git a/src/repr/fuzz/fuzz_targets/numeric_arith.rs b/src/repr/fuzz/fuzz_targets/numeric_arith.rs
new file mode 100644
index 0000000000000..47f2bb87c5f5f
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/numeric_arith.rs
@@ -0,0 +1,251 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `Numeric` (128-bit decimal) arithmetic through the standard
+//! datum context.
+//!
+//! Operands are generated two ways so the fuzzer reaches values that decimal
+//! text rarely produces:
+//! * parsed untrusted decimal text (`parse_numeric`), and
+//! * raw coefficient + exponent pairs (`from_i128` then `scaleb`), which lets
+//!   the fuzzer pin a full 39-digit coefficient at an arbitrary scale, hitting
+//!   the precision/exponent extremes of `cx_datum` that random text misses.
+//!
+//! Oracles (beyond no-panic on every op, including `rem`/`div` by zero and
+//! `pow`/`ln`/`log10`/`sqrt`/`round`/`rescale` on every value):
+//! * add/mul are commutative;
+//! * add/mul are associative *when the involved ops were exact* (decimal
+//!   rounding legitimately breaks associativity, so an `inexact` flag guards
+//!   the assertion);
+//! * additive inverse: `a + (-a) == 0`;
+//! * cancellation: `(a + b) - b == a` when exact;
+//! * identities `a + 0 == a`, `a * 1 == a`;
+//! * `sqrt` of a non-negative value is non-negative, and `abs` is
+//!   non-negative.
+
+#![no_main]
+
+use libfuzzer_sys::arbitrary::{self, Unstructured};
+use libfuzzer_sys::fuzz_target;
+use mz_repr::adt::numeric::{cx_datum, Numeric};
+use mz_repr::strconv::parse_numeric;
+
+/// Generate one operand from the fuzzer's byte stream.
+///
+/// Half the time we parse decimal text (reaching NaN/Inf/scientific notation),
+/// the other half we build `coeff * 10^exponent` directly so a wide coefficient
+/// can be placed at any scale.
+fn gen_operand(u: &mut Unstructured) -> arbitrary::Result<Option<Numeric>> {
+    if u.int_in_range(0u8..=1)? == 0 {
+        // Parsed text. Borrow the rest of the buffer as UTF-8-ish text. On
+        // failure fall back to a short arbitrary string.
+        let s: &str = u.arbitrary()?;
+        Ok(parse_numeric(s).ok().map(|d| d.0))
+    } else {
+        let coeff: i128 = u.arbitrary()?;
+        // Bound the shift to bracket the datum context's [-39, 38] exponent
+        // window without forcing every value outside it.
+        let exponent: i16 = u.int_in_range(-60i16..=60)?;
+        let mut cx = cx_datum();
+        let mut n = cx.from_i128(coeff);
+        let shift = Numeric::from(i32::from(exponent));
+        // `scaleb` respects the context (clamps/flags rather than panicking).
+        cx.scaleb(&mut n, &shift);
+        // A finite value that overflowed to infinity isn't an interesting "raw"
+        // operand. Drop it so special-value handling has a clear contract.
+        if n.is_infinite() {
+            Ok(None)
+        } else {
+            Ok(Some(n))
+        }
+    }
+}
+
+/// Was the operation that just ran in `cx` exact (no rounding, overflow, or
+/// invalid operation)? Algebraic identities only hold when exact.
+fn op_exact(cx: &dec::Context<Numeric>) -> bool {
+    let s = cx.status();
+    !s.inexact() && !s.invalid_operation() && !s.overflow()
+}
+
+fn check(a: Numeric, b: Numeric) {
+    // NaN never equals itself, so equality-based oracles can't be asserted.
+    if a.is_nan() || b.is_nan() {
+        // Still exercise every operation for panics.
+        let mut cx = cx_datum();
+        let mut t = a;
+        cx.add(&mut t, &b);
+        cx.mul(&mut t, &b);
+        cx.sub(&mut t, &b);
+        cx.div(&mut t, &b);
+        cx.rem(&mut t, &b);
+        cx.pow(&mut t, &b);
+        cx.ln(&mut t);
+        cx.log10(&mut t);
+        let mut s = a;
+        cx.sqrt(&mut s);
+        cx.round(&mut t);
+        return;
+    }
+
+    let mut cx = cx_datum();
+
+    // --- Commutativity -----------------------------------------------------
+    let (mut ab, mut ba) = (a, b);
+    cx.add(&mut ab, &b);
+    cx.add(&mut ba, &a);
+    assert_eq!(ab, ba, "numeric add is not commutative");
+
+    let (mut am, mut bm) = (a, b);
+    cx.mul(&mut am, &b);
+    cx.mul(&mut bm, &a);
+    assert_eq!(am, bm, "numeric mul is not commutative");
+
+    // --- Additive inverse: a + (-a) == 0 (exact for all finite a) ----------
+    {
+        let mut neg_a = a;
+        cx.neg(&mut neg_a);
+        let mut sum = a;
+        cx.clear_status();
+        cx.add(&mut sum, &neg_a);
+        if op_exact(&cx) {
+            assert!(sum.is_zero(), "a + (-a) must be zero, got {}", sum);
+        }
+    }
+
+    // --- Identities a + 0 == a, a * 1 == a ---------------------------------
+    {
+        let zero = Numeric::from(0);
+        let mut t = a;
+        cx.clear_status();
+        cx.add(&mut t, &zero);
+        if op_exact(&cx) {
+            assert_eq!(t, a, "a + 0 must equal a");
+        }
+
+        let one = Numeric::from(1);
+        let mut t = a;
+        cx.clear_status();
+        cx.mul(&mut t, &one);
+        if op_exact(&cx) {
+            assert_eq!(t, a, "a * 1 must equal a");
+        }
+    }
+
+    // --- Cancellation: (a + b) - b == a, only when both ops were exact -----
+    {
+        cx.clear_status();
+        let mut s = a;
+        cx.add(&mut s, &b);
+        let sum_exact = op_exact(&cx);
+        cx.clear_status();
+        cx.sub(&mut s, &b);
+        let sub_exact = op_exact(&cx);
+        if sum_exact && sub_exact {
+            assert_eq!(s, a, "(a + b) - b must equal a when exact");
+        }
+    }
+
+    // --- Associativity of add: (a+b)+a vs a+(b+a) when exact ---------------
+    // Re-using `a` as the third operand keeps the target's input binary while
+    // still re-associating across distinct magnitudes.
+    {
+        cx.clear_status();
+        let mut left = a;
+        cx.add(&mut left, &b);
+        cx.add(&mut left, &a);
+        let left_exact = op_exact(&cx);
+
+        cx.clear_status();
+        let mut bc = b;
+        cx.add(&mut bc, &a);
+        let mut right = a;
+        cx.add(&mut right, &bc);
+        let right_exact = op_exact(&cx);
+
+        if left_exact && right_exact {
+            assert_eq!(left, right, "numeric add is not associative (exact)");
+        }
+    }
+
+    // --- Associativity of mul: (a*b)*a vs a*(b*a) when exact ---------------
+    {
+        cx.clear_status();
+        let mut left = a;
+        cx.mul(&mut left, &b);
+        cx.mul(&mut left, &a);
+        let left_exact = op_exact(&cx);
+
+        cx.clear_status();
+        let mut bc = b;
+        cx.mul(&mut bc, &a);
+        let mut right = a;
+        cx.mul(&mut right, &bc);
+        let right_exact = op_exact(&cx);
+
+        if left_exact && right_exact {
+            assert_eq!(left, right, "numeric mul is not associative (exact)");
+        }
+    }
+
+    // --- No-panic exercise of the remaining ops, plus sign properties ------
+    let mut s = a;
+    cx.sub(&mut s, &b);
+    let mut q = a;
+    cx.div(&mut q, &b); // includes divide-by-zero
+    let mut r = a;
+    cx.rem(&mut r, &b); // includes rem-by-zero
+    let mut p = a;
+    cx.pow(&mut p, &b);
+
+    // sqrt of a non-negative finite value is non-negative.
+    {
+        let mut x = a;
+        cx.clear_status();
+        cx.sqrt(&mut x);
+        if !a.is_negative() && !x.is_nan() && !x.is_infinite() {
+            assert!(
+                !x.is_negative(),
+                "sqrt of non-negative must be non-negative"
+            );
+        }
+    }
+
+    // abs is never negative (modulo NaN, already excluded).
+    {
+        let mut x = a;
+        cx.abs(&mut x);
+        if !x.is_nan() {
+            assert!(!x.is_negative(), "abs must be non-negative");
+        }
+    }
+
+    // ln / log10 / round / rescale just must not panic.
+    let mut l = a;
+    cx.ln(&mut l);
+    let mut l = a;
+    cx.log10(&mut l);
+    let mut rd = a;
+    cx.round(&mut rd);
+    let mut rsc = a;
+    let _ = mz_repr::adt::numeric::rescale(&mut rsc, (b.exponent().unsigned_abs() % 40) as u8);
+}
+
+fn run(mut u: Unstructured) -> arbitrary::Result<()> {
+    let a = gen_operand(&mut u)?;
+    let b = gen_operand(&mut u)?;
+    if let (Some(a), Some(b)) = (a, b) {
+        check(a, b);
+    }
+    Ok(())
+}
+
+fuzz_target!(|data: &[u8]| {
+    let _ = run(Unstructured::new(data));
+});
diff --git a/src/repr/fuzz/fuzz_targets/range_ops.rs b/src/repr/fuzz/fuzz_targets/range_ops.rs
new file mode 100644
index 0000000000000..a280ca311dbd9
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/range_ops.rs
@@ -0,0 +1,146 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `Range` set operations over `int4` bounds. Ranges are built
+//! from user input and canonicalized, then unioned/intersected/differenced.
+//! None of this may panic, and the results must satisfy set algebra. Bounds are
+//! all `int4` so the documented "finite bounds of different types" precondition
+//! panic in `canonicalize` stays unreachable.
+//!
+//! Oracles (beyond no-panic):
+//! * union (when representable as one range) contains both operands. Union is
+//!   commutative and idempotent (`a ∪ a == a`);
+//! * intersection is contained in both operands. Intersection is commutative
+//!   and idempotent (`a ∩ a == a`);
+//! * difference algebra: when `a ∖ b` is representable, it is a subset of `a`
+//!   and is disjoint from `b` (empty intersection with `b`);
+//! * `contains_elem(x)` agrees with "intersecting the point range `[x,x]` is
+//!   non-empty".
+ +#![no_main] + +use libfuzzer_sys::arbitrary::{self, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_repr::adt::range::{Range, RangeBound}; +use mz_repr::Datum; + +// None = infinite bound. Some((inclusive, value)) = finite bound. +type BoundSpec = Option<(bool, i32)>; +// None = empty range. Some((lower, upper)) = a range with those bounds. +type RangeSpec = Option<(BoundSpec, BoundSpec)>; + +fn make_range(spec: RangeSpec) -> Range> { + let inner = spec.map(|(lo, hi)| { + let lower = RangeBound { + inclusive: matches!(lo, Some((true, _))), + bound: lo.map(|(_, v)| Datum::Int32(v)), + }; + let upper = RangeBound { + inclusive: matches!(hi, Some((true, _))), + bound: hi.map(|(_, v)| Datum::Int32(v)), + }; + (lower, upper) + }); + Range::new(inner) +} + +/// Build a canonical operand, or `None` if the spec doesn't canonicalize +/// (misordered bounds, etc.). +fn canonical(spec: RangeSpec) -> Option>> { + let mut r = make_range(spec); + r.canonicalize().ok()?; + Some(r) +} + +/// Re-anchor a `Range>` (e.g. a `difference` result) to `'static`. +/// All bounds here are `int4`, which own no borrowed data, so this is just a +/// lifetime launder. +fn to_static(r: Range>) -> Range> { + r.into_bounds(|d| match d { + Datum::Int32(v) => Datum::Int32(v), + other => unreachable!("non-int4 bound in int4 range: {other:?}"), + }) +} + +fn check(a: Range>, b: Range>, elem: i32) { + let _ = a.contains_elem(&elem); + + // --- Union: contains both operands, commutative, idempotent. ---------- + if let Ok(union) = a.union(&b) { + assert!( + union.contains_range(&a) && union.contains_range(&b), + "union must contain both operands" + ); + } + assert_eq!( + a.union(&b), + b.union(&a), + "union must be commutative" + ); + if let Ok(uaa) = a.union(&a) { + assert_eq!(uaa, a, "union must be idempotent (a ∪ a == a)"); + } + + // --- Intersection: contained in both, commutative, idempotent. 
-------- + let intersection = a.intersection(&b); + assert!( + a.contains_range(&intersection) && b.contains_range(&intersection), + "intersection must be contained in both operands" + ); + assert_eq!( + a.intersection(&b), + b.intersection(&a), + "intersection must be commutative" + ); + assert_eq!( + a.intersection(&a), + a, + "intersection must be idempotent (a ∩ a == a)" + ); + + // --- Difference algebra: a ∖ b ⊆ a and disjoint from b. --------------- + if let Ok(diff) = a.difference(&b) { + let diff = to_static(diff); + assert!( + a.contains_range(&diff), + "a ∖ b must be a subset of a" + ); + assert!( + diff.intersection(&b).inner.is_none(), + "a ∖ b must be disjoint from b" + ); + } + + // --- contains_elem(x) consistency vs the point range [x, x]. ---------- + // `[x, x]` canonicalizes to the singleton {x}. If `x == i32::MAX` the upper + // bound's `step` overflows and canonicalization fails, in which case we + // simply skip the comparison. + if let Some(point) = canonical(Some((Some((true, elem)), Some((true, elem))))) { + let contains = a.contains_elem(&elem); + let intersects = a.intersection(&point).inner.is_some(); + assert_eq!( + contains, intersects, + "contains_elem({elem}) must agree with [x,x] intersection" + ); + } +} + +fn run(mut u: Unstructured) -> arbitrary::Result<()> { + let ra: RangeSpec = u.arbitrary()?; + let rb: RangeSpec = u.arbitrary()?; + let elem: i32 = u.arbitrary()?; + if let (Some(a), Some(b)) = (canonical(ra), canonical(rb)) { + check(a, b, elem); + } + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let _ = run(Unstructured::new(data)); +}); diff --git a/src/repr/fuzz/fuzz_targets/relation_desc_proto_roundtrip.rs b/src/repr/fuzz/fuzz_targets/relation_desc_proto_roundtrip.rs new file mode 100644 index 0000000000000..c967acdbfa5e6 --- /dev/null +++ b/src/repr/fuzz/fuzz_targets/relation_desc_proto_roundtrip.rs @@ -0,0 +1,83 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. 
+// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `RelationDesc` proto round-trips losslessly. +//! `RelationDesc` is the schema of every persisted collection, so a decoder +//! bug here corrupts catalog/persist state. +//! +//! Two arms (the first byte selects): +//! - Arbitrary arm: drive `RelationDesc`'s proptest `Arbitrary` strategy from +//! the fuzzer bytes to build a *valid* desc: column names paired with +//! deeply-nested `SqlColumnType`s, where the names<->metadata length +//! invariant and the per-version migration metadata are well-formed. Assert +//! `from_proto(into_proto(v)) == v`. Random proto bytes never satisfy the +//! length invariant, so the encoder's rollup/migration-default paths are +//! only reached here. +//! - Raw-bytes arm: decode arbitrary bytes as `ProtoRelationDesc`, into Rust, +//! and re-encode, keeping coverage of the bare wire decoder against hostile +//! input (including descs that violate the length invariant on decode). 
+ +#![no_main] + +use libfuzzer_sys::fuzz_target; +use mz_proto::{ProtoType, RustType}; +use mz_repr::{ProtoRelationDesc, RelationDesc}; +use proptest::strategy::{Strategy, ValueTree}; +use proptest::test_runner::{Config, RngAlgorithm, TestRng, TestRunner}; +use prost::Message; + +fn arbitrary_arm(seed: &[u8]) { + let mut buf = [0u8; 32]; + for (dst, src) in buf.iter_mut().zip(seed.iter()) { + *dst = *src; + } + let rng = TestRng::from_seed(RngAlgorithm::ChaCha, &buf); + let mut runner = TestRunner::new_with_rng(Config::default(), rng); + let value = match ::arbitrary() + .new_tree(&mut runner) + { + Ok(tree) => tree.current(), + Err(_) => return, + }; + + let proto = value.into_proto(); + let back = RelationDesc::from_proto(proto).expect("valid RelationDesc must round-trip"); + assert_eq!(value, back, "RelationDesc changed across proto roundtrip"); +} + +fn raw_arm(data: &[u8]) { + let Ok(proto) = ProtoRelationDesc::decode(data) else { + return; + }; + let orig: RelationDesc = match proto.into_rust() { + Ok(v) => v, + Err(_) => return, + }; + + let proto2 = >::from_rust(&orig); + let bytes2 = proto2.encode_to_vec(); + let proto3 = ProtoRelationDesc::decode(bytes2.as_slice()) + .expect("re-encode of valid RelationDesc must decode"); + let round: RelationDesc = proto3 + .into_rust() + .expect("re-encoded RelationDesc must convert back to Rust"); + + assert_eq!(orig, round, "RelationDesc changed across proto roundtrip"); +} + +fuzz_target!(|data: &[u8]| { + let Some((&mode, rest)) = data.split_first() else { + return; + }; + if mode & 1 == 0 { + arbitrary_arm(rest); + } else { + raw_arm(rest); + } +}); diff --git a/src/repr/fuzz/fuzz_targets/row_arrow_roundtrip.rs b/src/repr/fuzz/fuzz_targets/row_arrow_roundtrip.rs new file mode 100644 index 0000000000000..76e7051dab274 --- /dev/null +++ b/src/repr/fuzz/fuzz_targets/row_arrow_roundtrip.rs @@ -0,0 +1,491 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. 
+// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: a `Row` survives persist's columnar Arrow encode/decode round +//! trip. `RelationDesc`'s `Schema` impl encodes rows into Arrow arrays +//! (the on-disk persist format) and decodes them back. A mismatch is silent +//! data corruption. This is coverage-guided, complementing the existing +//! `proptest_non_empty_relation_descs` property test. +//! +//! The generator focuses on the scalar types with non-trivial packed encodings +//! (PackedNumeric, PackedNaiveDateTime, PackedInterval, PackedNaiveTime, ...), +//! plus a few shapes that exercise interactions the bare-scalar columns can't: +//! +//! * `Char { length }` / `VarChar { max_length }`: these decode through the +//! same Arrow `StringArray` as plain strings, but carry a length parameter +//! in the schema. We generate the parameter independently of the stored +//! string to stress the schema/value split. +//! * `Numeric { max_scale }` with the special values `NaN`, `Infinity`, and +//! `-Infinity` (round-tripped via `PackedNumeric`'s BCD bytes), in addition +//! to ordinary finite decimals, and a randomized `max_scale` in the schema. +//! * Shallow composites (`List`/`Array`/`Map` of a scalar element and +//! `Range` of a discrete scalar), placed alongside the packed-scalar columns +//! so the multi-column encoder builds nested Arrow arrays next to the flat +//! ones. The composite element is always a simple (non-composite) scalar to +//! keep nesting shallow. Deeper nesting already has proptest coverage. +//! +//! Leap seconds are excluded (a separate known encoding gap). 
+ +#![no_main] + +use std::borrow::Borrow; + +use chrono::{DateTime, NaiveTime, Utc}; +use libfuzzer_sys::arbitrary::{self, Arbitrary, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_persist_types::columnar::{ColumnDecoder, ColumnEncoder, Schema}; +use mz_repr::adt::array::ArrayDimension; +use mz_repr::adt::char::CharLength; +use mz_repr::adt::date::Date; +use mz_repr::adt::interval::Interval; +use mz_repr::adt::numeric::{cx_datum, Numeric, NumericMaxScale}; +use mz_repr::adt::range::{Range, RangeBound, RangeLowerBound, RangeUpperBound}; +use mz_repr::adt::timestamp::CheckedTimestamp; +use mz_repr::adt::varchar::VarCharMaxLength; +use mz_repr::strconv::parse_numeric; +use mz_repr::{Datum, RelationDesc, Row, SqlColumnType, SqlScalarType, Timestamp}; + +/// Generate a "simple" scalar type: one that maps to a single flat Arrow array +/// and can serve as a composite element type (no nesting). +fn gen_simple_scalar_type(u: &mut Unstructured) -> arbitrary::Result { + Ok(match u.int_in_range(0u8..=20)? 
{ + 0 => SqlScalarType::Bool, + 1 => SqlScalarType::Int16, + 2 => SqlScalarType::Int32, + 3 => SqlScalarType::Int64, + 4 => SqlScalarType::UInt16, + 5 => SqlScalarType::UInt32, + 6 => SqlScalarType::UInt64, + 7 => SqlScalarType::PgLegacyChar, // a u8 datum + 8 => SqlScalarType::Float32, + 9 => SqlScalarType::Float64, + 10 => SqlScalarType::String, + 11 => SqlScalarType::Bytes, + 12 => SqlScalarType::Numeric { + max_scale: gen_max_scale(u)?, + }, + 13 => SqlScalarType::Date, + 14 => SqlScalarType::Time, + 15 => SqlScalarType::Timestamp { precision: None }, + 16 => SqlScalarType::TimestampTz { precision: None }, + 17 => SqlScalarType::Interval, + 18 => SqlScalarType::MzTimestamp, + 19 => SqlScalarType::Char { + length: gen_char_length(u)?, + }, + _ => SqlScalarType::VarChar { + max_length: gen_varchar_max_length(u)?, + }, + }) +} + +/// Generate any column scalar type: a simple scalar, or a shallow composite +/// (`List`/`Array`/`Map` of a simple scalar, or `Range` of a discrete scalar). +fn gen_scalar_type(u: &mut Unstructured) -> arbitrary::Result { + // Bias toward simple scalars (which carry the packed encodings we mostly + // care about) but reach the composites a meaningful fraction of the time. + Ok(match u.int_in_range(0u8..=11)? { + 0..=7 => gen_simple_scalar_type(u)?, + 8 => SqlScalarType::List { + element_type: Box::new(gen_element_type(u)?), + custom_id: None, + }, + 9 => SqlScalarType::Array(Box::new(gen_element_type(u)?)), + 10 => SqlScalarType::Map { + value_type: Box::new(gen_element_type(u)?), + custom_id: None, + }, + _ => SqlScalarType::Range { + element_type: Box::new(gen_range_element_type(u)?), + }, + }) +} + +/// Composite element types. Restricted to scalars whose `Datum` is `Copy` (i.e. +/// carries no borrow into the input buffer) so we can collect element datums +/// into a `Vec` and push them without leaking backing storage. This still +/// exercises the nested Arrow encoders for the packed-scalar element types. 
+fn gen_element_type(u: &mut Unstructured) -> arbitrary::Result { + Ok(match u.int_in_range(0u8..=11)? { + 0 => SqlScalarType::Bool, + 1 => SqlScalarType::Int16, + 2 => SqlScalarType::Int32, + 3 => SqlScalarType::Int64, + 4 => SqlScalarType::UInt32, + 5 => SqlScalarType::UInt64, + 6 => SqlScalarType::Float64, + 7 => SqlScalarType::Numeric { + max_scale: gen_max_scale(u)?, + }, + 8 => SqlScalarType::Date, + 9 => SqlScalarType::Time, + 10 => SqlScalarType::Timestamp { precision: None }, + _ => SqlScalarType::Interval, + }) +} + +/// Range element types are restricted to the discrete/continuous types that +/// `RangeBound::canonicalize` accepts. We use the two discrete integer types so +/// canonicalization is exercised without dragging in continuous-type quirks. +fn gen_range_element_type(u: &mut Unstructured) -> arbitrary::Result { + Ok(if bool::arbitrary(u)? { + SqlScalarType::Int32 + } else { + SqlScalarType::Int64 + }) +} + +/// A `max_scale` for `Numeric`: `None` half the time, otherwise an in-range +/// `0..=39` scale to stress the schema-side parameter. +fn gen_max_scale(u: &mut Unstructured) -> arbitrary::Result> { + if bool::arbitrary(u)? { + Ok(None) + } else { + let s = u.int_in_range(0i64..=39)?; + Ok(NumericMaxScale::try_from(s).ok()) + } +} + +/// A `Char` length: `None` (the "list element" form) some of the time, +/// otherwise a small positive length. +fn gen_char_length(u: &mut Unstructured) -> arbitrary::Result> { + if u.ratio(1u8, 4u8)? { + Ok(None) + } else { + let n = u.int_in_range(1i64..=300)?; + Ok(CharLength::try_from(n).ok()) + } +} + +/// A `VarChar` max length: `None` some of the time, otherwise a small positive +/// max length. +fn gen_varchar_max_length(u: &mut Unstructured) -> arbitrary::Result> { + if u.ratio(1u8, 4u8)? 
{ + Ok(None) + } else { + let n = u.int_in_range(1i64..=300)?; + Ok(VarCharMaxLength::try_from(n).ok()) + } +} + +fn gen_naive_ts( + u: &mut Unstructured, +) -> arbitrary::Result> { + let secs = u.int_in_range(-8_000_000_000_000i64..=8_000_000_000_000)?; + let nanos = u.int_in_range(0u32..=999_999_999)?; // < 1s: no leap second + Ok(DateTime::from_timestamp(secs, nanos) + .and_then(|d| CheckedTimestamp::from_timestamplike(d.naive_utc()).ok()) + .unwrap_or_else(|| { + CheckedTimestamp::from_timestamplike( + DateTime::from_timestamp(0, 0).unwrap().naive_utc(), + ) + .unwrap() + })) +} + +fn gen_utc_ts(u: &mut Unstructured) -> arbitrary::Result>> { + let secs = u.int_in_range(-8_000_000_000_000i64..=8_000_000_000_000)?; + let nanos = u.int_in_range(0u32..=999_999_999)?; + Ok(DateTime::from_timestamp(secs, nanos) + .and_then(|d| CheckedTimestamp::from_timestamplike(d).ok()) + .unwrap_or_else(|| { + CheckedTimestamp::from_timestamplike(DateTime::from_timestamp(0, 0).unwrap()).unwrap() + })) +} + +/// Generate a `Numeric` datum value, reaching the special values that an Arrow +/// `PackedNumeric` must round-trip (`NaN`, `±Infinity`) as well as ordinary +/// finite decimals. +/// +/// `parse_numeric` deliberately rejects directly-typed `NaN`/`Infinity` (it +/// only allows infinities that arose from overflow), so the special values are +/// constructed straight from the `dec` context instead of via text. +fn gen_numeric(u: &mut Unstructured) -> arbitrary::Result> { + let n = match u.int_in_range(0u8..=5)? { + 0 => Numeric::nan(), + 1 => Numeric::infinity(), + 2 => { + let mut cx = cx_datum(); + let mut n = Numeric::infinity(); + cx.neg(&mut n); + n + } + _ => { + let s = format!("{}.{}", i64::arbitrary(u)?, u32::arbitrary(u)?); + return Ok(parse_numeric(&s).unwrap_or_else(|_| parse_numeric("0").unwrap())); + } + }; + Ok(dec::OrderedDecimal(n)) +} + +/// Push one datum of the given type (or NULL) into the row. Byte reads use `?`. 
+/// Out-of-range constructed values fall back to a valid datum so the row always +/// matches the column type. +fn push_datum( + packer: &mut mz_repr::RowPacker, + u: &mut Unstructured, + ty: &SqlScalarType, + nullable: bool, +) -> arbitrary::Result<()> { + if nullable && u.ratio(1u8, 8u8)? { + packer.push(Datum::Null); + return Ok(()); + } + match ty { + SqlScalarType::Bool => packer.push(if bool::arbitrary(u)? { + Datum::True + } else { + Datum::False + }), + SqlScalarType::Int16 => packer.push(Datum::Int16(i16::arbitrary(u)?)), + SqlScalarType::Int32 => packer.push(Datum::Int32(i32::arbitrary(u)?)), + SqlScalarType::Int64 => packer.push(Datum::Int64(i64::arbitrary(u)?)), + SqlScalarType::UInt16 => packer.push(Datum::UInt16(u16::arbitrary(u)?)), + SqlScalarType::UInt32 => packer.push(Datum::UInt32(u32::arbitrary(u)?)), + SqlScalarType::UInt64 => packer.push(Datum::UInt64(u64::arbitrary(u)?)), + SqlScalarType::PgLegacyChar => packer.push(Datum::UInt8(u8::arbitrary(u)?)), + SqlScalarType::Float32 => packer.push(Datum::Float32(f32::arbitrary(u)?.into())), + SqlScalarType::Float64 => packer.push(Datum::Float64(f64::arbitrary(u)?.into())), + SqlScalarType::String => packer.push(Datum::String(<&str>::arbitrary(u)?)), + // Char/VarChar are stored as `Datum::String` and encode through the + // same Arrow `StringArray`. The schema-side length parameter does not + // truncate/pad here, so any string round-trips. + SqlScalarType::Char { .. } | SqlScalarType::VarChar { .. } => { + packer.push(Datum::String(<&str>::arbitrary(u)?)) + } + SqlScalarType::Bytes => packer.push(Datum::Bytes(<&[u8]>::arbitrary(u)?)), + SqlScalarType::Numeric { .. } => { + packer.push(Datum::Numeric(gen_numeric(u)?)); + } + SqlScalarType::Date => { + let d = Date::from_pg_epoch(i32::arbitrary(u)?) 
+ .unwrap_or_else(|_| Date::from_pg_epoch(0).unwrap()); + packer.push(Datum::Date(d)); + } + SqlScalarType::Time => { + let secs = u.int_in_range(0u32..=86_399)?; + let nanos = u.int_in_range(0u32..=999_999_999)?; + let t = NaiveTime::from_num_seconds_from_midnight_opt(secs, nanos).unwrap(); + packer.push(Datum::Time(t)); + } + SqlScalarType::Timestamp { .. } => packer.push(Datum::Timestamp(gen_naive_ts(u)?)), + SqlScalarType::TimestampTz { .. } => packer.push(Datum::TimestampTz(gen_utc_ts(u)?)), + SqlScalarType::Interval => packer.push(Datum::Interval(Interval::new( + i32::arbitrary(u)?, + i32::arbitrary(u)?, + i64::arbitrary(u)?, + ))), + SqlScalarType::MzTimestamp => { + packer.push(Datum::MzTimestamp(Timestamp::from(u64::arbitrary(u)?))) + } + // A `List` of `element_type`: a small run of element datums (each of + // which may itself be NULL since list elements are always nullable). + SqlScalarType::List { element_type, .. } => { + let elems = gen_scalar_datums(u, element_type)?; + packer.push_list(elems.iter().map(Borrow::borrow)); + } + // A 1-D `Array` of `element_type`. The single dimension's length must + // match the number of pushed elements. `lower_bound` is always 1 (not fuzzed). + SqlScalarType::Array(element_type) => { + let elems = gen_scalar_datums(u, element_type)?; + let dims = if elems.is_empty() { + vec![] + } else { + vec![ArrayDimension { + lower_bound: 1, + length: elems.len(), + }] + }; + // The element count always matches `dims`, so this can't fail. Fall + // back to an empty array out of an abundance of caution. + if packer + .try_push_array(&dims, elems.iter().map(Borrow::borrow)) + .is_err() + { + packer.try_push_array(&[], std::iter::empty::()).unwrap(); + } + } + // A `Map` of string keys to `value_type` values. Keys must be unique + // and sorted, which `push_dict` does not enforce, so we dedup/sort here. + SqlScalarType::Map { value_type, ..
} => { + let n = u.int_in_range(0usize..=6)?; + let mut entries: Vec<(String, Datum)> = Vec::with_capacity(n); + for _ in 0..n { + let k = String::arbitrary(u)?; + entries.push((k, gen_scalar_datum(u, value_type)?)); + } + entries.sort_by(|a, b| a.0.cmp(&b.0)); + entries.dedup_by(|a, b| a.0 == b.0); + packer.push_dict(entries.iter().map(|(k, v)| (k.as_str(), v))); + } + // A `Range` over a discrete element type. Bounds are independently + // finite/infinite and inclusive/exclusive. `push_range` canonicalizes. + SqlScalarType::Range { element_type } => { + push_range(packer, u, element_type)?; + } + // gen_scalar_type only produces the types above. + _ => packer.push(Datum::Null), + } + Ok(()) +} + +/// Generate a small vector of owned datums of a simple scalar type, for use as +/// the elements of a list/array. Elements may be NULL. +fn gen_scalar_datums<'a>( + u: &mut Unstructured, + ty: &SqlScalarType, +) -> arbitrary::Result>> { + let n = u.int_in_range(0usize..=6)?; + let mut out = Vec::with_capacity(n); + for _ in 0..n { + if u.ratio(1u8, 8u8)? { + out.push(Datum::Null); + } else { + out.push(gen_scalar_datum(u, ty)?); + } + } + Ok(out) +} + +/// Generate a single owned (non-null) datum for a composite element type. +/// Only the `Copy` `Datum` variants (those produced by `gen_element_type` / +/// `gen_range_element_type`) are handled, so the returned `Datum` carries no +/// borrow and the caller can collect it into a `Vec`. +fn gen_scalar_datum<'a>( + u: &mut Unstructured, + ty: &SqlScalarType, +) -> arbitrary::Result> { + Ok(match ty { + SqlScalarType::Bool => { + if bool::arbitrary(u)? 
{ + Datum::True + } else { + Datum::False + } + } + SqlScalarType::Int16 => Datum::Int16(i16::arbitrary(u)?), + SqlScalarType::Int32 => Datum::Int32(i32::arbitrary(u)?), + SqlScalarType::Int64 => Datum::Int64(i64::arbitrary(u)?), + SqlScalarType::UInt32 => Datum::UInt32(u32::arbitrary(u)?), + SqlScalarType::UInt64 => Datum::UInt64(u64::arbitrary(u)?), + SqlScalarType::Float64 => Datum::Float64(f64::arbitrary(u)?.into()), + SqlScalarType::Numeric { .. } => Datum::Numeric(gen_numeric(u)?), + SqlScalarType::Date => Datum::Date( + Date::from_pg_epoch(i32::arbitrary(u)?).unwrap_or_else(|_| Date::from_pg_epoch(0).unwrap()), + ), + SqlScalarType::Time => { + let secs = u.int_in_range(0u32..=86_399)?; + let nanos = u.int_in_range(0u32..=999_999_999)?; + Datum::Time(NaiveTime::from_num_seconds_from_midnight_opt(secs, nanos).unwrap()) + } + SqlScalarType::Timestamp { .. } => Datum::Timestamp(gen_naive_ts(u)?), + SqlScalarType::TimestampTz { .. } => Datum::TimestampTz(gen_utc_ts(u)?), + SqlScalarType::Interval => Datum::Interval(Interval::new( + i32::arbitrary(u)?, + i32::arbitrary(u)?, + i64::arbitrary(u)?, + )), + SqlScalarType::MzTimestamp => Datum::MzTimestamp(Timestamp::from(u64::arbitrary(u)?)), + // gen_element_type / gen_range_element_type only produce the types above. + _ => Datum::Null, + }) +} + +/// Push a `Range` of the given discrete element type. Each bound is +/// independently infinite (`Datum::Null` => infinite via `RangeBound::new`) or +/// a finite element value, and independently inclusive/exclusive. Empty ranges +/// are reached when `push_range`'s canonicalization collapses the bounds. +fn push_range( + packer: &mut mz_repr::RowPacker, + u: &mut Unstructured, + element_type: &SqlScalarType, +) -> arbitrary::Result<()> { + // ~1/8 of the time emit the explicitly-empty range. + if u.ratio(1u8, 8u8)? { + let _ = packer.push_range(Range { inner: None }); + return Ok(()); + } + let lower_d = if bool::arbitrary(u)?
{ + Datum::Null + } else { + gen_scalar_datum(u, element_type)? + }; + let upper_d = if bool::arbitrary(u)? { + Datum::Null + } else { + gen_scalar_datum(u, element_type)? + }; + let lower: RangeLowerBound = RangeBound::new(lower_d, bool::arbitrary(u)?); + let upper: RangeUpperBound = RangeBound::new(upper_d, bool::arbitrary(u)?); + // An out-of-order range (lower > upper) is an `InvalidRangeError`. On + // failure just fall back to the empty range so the column stays valid. + if packer + .push_range(Range::new(Some((lower, upper)))) + .is_err() + { + let _ = packer.push_range(Range { inner: None }); + } + Ok(()) +} + +fn run(u: &mut Unstructured) -> arbitrary::Result<()> { + let ncols = u.int_in_range(1usize..=6)?; + let cols: Vec = (0..ncols) + .map(|_| { + Ok(SqlColumnType { + scalar_type: gen_scalar_type(u)?, + nullable: bool::arbitrary(u)?, + }) + }) + .collect::>()?; + + let mut builder = RelationDesc::builder(); + for (i, c) in cols.iter().enumerate() { + builder = builder.with_column(format!("c{i}"), c.clone()); + } + let desc = builder.finish(); + + let nrows = u.int_in_range(0usize..=10)?; + let mut rows = Vec::with_capacity(nrows); + for _ in 0..nrows { + let mut row = Row::default(); + { + let mut packer = row.packer(); + for c in &cols { + push_datum(&mut packer, u, &c.scalar_type, c.nullable)?; + } + } + rows.push(row); + } + + let Ok(mut encoder) = >::encoder(&desc) else { + return Ok(()); + }; + for row in &rows { + encoder.append(row); + } + let col = encoder.finish(); + let Ok(decoder) = >::decoder(&desc, col) else { + return Ok(()); + }; + for (i, orig) in rows.iter().enumerate() { + let mut out = Row::default(); + decoder.decode(i, &mut out); + assert_eq!( + orig, &out, + "Row changed across Arrow columnar round trip (desc = {desc:?})" + ); + } + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let mut u = Unstructured::new(data); + let _ = run(&mut u); +}); diff --git a/src/repr/fuzz/fuzz_targets/row_codec_roundtrip.rs 
b/src/repr/fuzz/fuzz_targets/row_codec_roundtrip.rs new file mode 100644 index 0000000000000..ddec355bf8efa --- /dev/null +++ b/src/repr/fuzz/fuzz_targets/row_codec_roundtrip.rs @@ -0,0 +1,95 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `Row`'s `Codec` impl (the persistence wrapper around +//! `ProtoRow`) round-trips losslessly. This goes through `Codec::decode + +//! Codec::encode`, which is the path persist actually uses on read/write. It is +//! distinct from the bare `ProtoRow` path in `row_proto_roundtrip` because it +//! threads the `RelationDesc` schema through. +//! +//! The schema is the whole point of this target, so we decode against a +//! NON-empty, randomized `RelationDesc` rather than `RelationDesc::empty()`. +//! With the empty schema the schema-directed decode loop +//! (`Row::decode_from_proto`, which iterates `desc.iter_all()` and pulls each +//! column's proto datum, padding missing columns with `Datum::Null`) never +//! runs. It produces an empty `Row` for any input, so the round trip is +//! trivially satisfied and the schema threading this target tests goes +//! unexercised. +//! +//! Generation: the first few input bytes pick a column count and a column type +//! per column (the type menu mirrors the scalar shapes persist stores). The +//! remaining bytes are the encoded `ProtoRow` body. Note `decode_from_proto` +//! does not type-check the proto datums against the column types, so the schema +//! controls the column *count* and the per-column iteration / null-padding, +//! which a non-empty schema exercises and the empty schema does not. The +//! 
invariant under test is that decode(encode(decode(bytes))) == decode(bytes) +//! for a fixed schema. + +#![no_main] + +use libfuzzer_sys::arbitrary::{Arbitrary, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_persist_types::Codec; +use mz_repr::{RelationDesc, Row, SqlColumnType, SqlScalarType}; + +/// Build a small, randomized non-empty `RelationDesc` from a prefix of the +/// input. The column types only affect the schema-directed decode via the +/// column *count* and iteration order (decode does not validate proto datums +/// against the types), but we still vary the types so the schema is realistic +/// and any type-sensitive code reachable through the desc gets exercised. +fn arb_relation_desc(u: &mut Unstructured) -> RelationDesc { + // 1..=8 columns. Fall back to a single column if the buffer is exhausted so + // we always test a genuinely non-empty schema. + let ncols = u.int_in_range(1usize..=8).unwrap_or(1); + let mut builder = RelationDesc::builder(); + for i in 0..ncols { + let scalar_type = arb_scalar_type(u); + let nullable = bool::arbitrary(u).unwrap_or(true); + builder = builder.with_column(format!("c{i}"), SqlColumnType { scalar_type, nullable }); + } + builder.finish() +} + +/// Pick a scalar type for a schema column. Defaults to `Bool` (index 0) when the buffer +/// is exhausted.
+fn arb_scalar_type(u: &mut Unstructured) -> SqlScalarType { + match u.int_in_range(0u8..=14).unwrap_or(0) { + 0 => SqlScalarType::Bool, + 1 => SqlScalarType::Int16, + 2 => SqlScalarType::Int32, + 3 => SqlScalarType::Int64, + 4 => SqlScalarType::UInt32, + 5 => SqlScalarType::UInt64, + 6 => SqlScalarType::Float32, + 7 => SqlScalarType::Float64, + 8 => SqlScalarType::String, + 9 => SqlScalarType::Bytes, + 10 => SqlScalarType::Date, + 11 => SqlScalarType::Time, + 12 => SqlScalarType::Timestamp { precision: None }, + 13 => SqlScalarType::Numeric { max_scale: None }, + _ => SqlScalarType::Jsonb, + } +} + +fuzz_target!(|data: &[u8]| { + let mut u = Unstructured::new(data); + // Derive the schema from the front of the buffer, then decode the rest as a + // `ProtoRow` against that non-empty schema. + let schema = arb_relation_desc(&mut u); + let rest = u.take_rest(); + + let Ok(orig) = Row::decode(rest, &schema) else { + return; + }; + let mut buf = Vec::new(); + orig.encode(&mut buf); + let round = Row::decode(&buf, &schema).expect("re-encode of a valid Row must decode"); + assert_eq!(orig, round, "Row changed across Codec roundtrip (schema = {schema:?})"); +}); diff --git a/src/repr/fuzz/fuzz_targets/row_proto_roundtrip.rs b/src/repr/fuzz/fuzz_targets/row_proto_roundtrip.rs new file mode 100644 index 0000000000000..4698cb43a6f76 --- /dev/null +++ b/src/repr/fuzz/fuzz_targets/row_proto_roundtrip.rs @@ -0,0 +1,41 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: every byte sequence that decodes as `ProtoRow` and then into +//! a Rust `Row` must survive a proto re-encode + re-decode with the same +//! value. 
`Row` is the core value representation in Materialize and the
+//! proto wire format is part of the persist on-disk format, so any decoder
+//! laxity that doesn't round-trip is a real correctness risk.
+
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use mz_proto::ProtoType;
+use mz_repr::{ProtoRow, Row};
+use prost::Message;
+
+fuzz_target!(|data: &[u8]| {
+    let Ok(proto) = ProtoRow::decode(data) else {
+        return;
+    };
+    let orig: Row = match proto.into_rust() {
+        Ok(v) => v,
+        Err(_) => return,
+    };
+
+    let proto2 = <ProtoRow as ProtoType<Row>>::from_rust(&orig); // Rust value -> proto
+    let bytes2 = proto2.encode_to_vec();
+    let proto3 = ProtoRow::decode(bytes2.as_slice())
+        .expect("re-encode of valid Row must decode");
+    let round: Row = proto3
+        .into_rust()
+        .expect("re-encoded Row must convert back to Rust");
+
+    assert_eq!(orig, round, "Row changed across proto roundtrip");
+});
diff --git a/src/repr/fuzz/fuzz_targets/scalar_type_proto_roundtrip.rs b/src/repr/fuzz/fuzz_targets/scalar_type_proto_roundtrip.rs
new file mode 100644
index 0000000000000..74f020306f4a7
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/scalar_type_proto_roundtrip.rs
@@ -0,0 +1,82 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `SqlScalarType` proto round-trips losslessly.
+//! `SqlScalarType` describes the type of every column in every relation, so
+//! any decoder bug here propagates through the type system.
+//!
+//! Two arms (the first byte selects):
+//! - Arbitrary arm: drive `SqlScalarType`'s proptest `Arbitrary` strategy from
+//!   the fuzzer bytes to build a *valid, deeply-nested* type (the recursive
+//!   `List`/`Map`/`Array`/`Record`/`Range` variants, boundary `max_scale` and
+//!
char/varchar `length`, custom OIDs, etc.) and assert
+//!   `from_proto(into_proto(v)) == v`. Random proto bytes almost never reach
+//!   these variants, so this arm is what actually exercises the encoder.
+//! - Raw-bytes arm: decode arbitrary bytes as `ProtoScalarType`, into Rust, and
+//!   re-encode, keeping coverage of the bare wire decoder against hostile
+//!   input.
+
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use mz_proto::{ProtoType, RustType};
+use mz_repr::{ProtoScalarType, SqlScalarType};
+use proptest::strategy::{Strategy, ValueTree};
+use proptest::test_runner::{Config, RngAlgorithm, TestRng, TestRunner};
+use prost::Message;
+
+fn arbitrary_arm(seed: &[u8]) {
+    let mut buf = [0u8; 32];
+    for (dst, src) in buf.iter_mut().zip(seed.iter()) {
+        *dst = *src;
+    }
+    let rng = TestRng::from_seed(RngAlgorithm::ChaCha, &buf);
+    let mut runner = TestRunner::new_with_rng(Config::default(), rng);
+    let value = match <SqlScalarType as proptest::arbitrary::Arbitrary>::arbitrary()
+        .new_tree(&mut runner)
+    {
+        Ok(tree) => tree.current(),
+        Err(_) => return,
+    };
+
+    let proto = value.into_proto();
+    let back = SqlScalarType::from_proto(proto).expect("valid SqlScalarType must round-trip");
+    assert_eq!(value, back, "SqlScalarType changed across proto roundtrip");
+}
+
+fn raw_arm(data: &[u8]) {
+    let Ok(proto) = ProtoScalarType::decode(data) else {
+        return;
+    };
+    let orig: SqlScalarType = match proto.into_rust() {
+        Ok(v) => v,
+        Err(_) => return,
+    };
+
+    let proto2 = <ProtoScalarType as ProtoType<SqlScalarType>>::from_rust(&orig);
+    let bytes2 = proto2.encode_to_vec();
+    let proto3 = ProtoScalarType::decode(bytes2.as_slice())
+        .expect("re-encode of valid SqlScalarType must decode");
+    let round: SqlScalarType = proto3
+        .into_rust()
+        .expect("re-encoded SqlScalarType must convert back to Rust");
+
+    assert_eq!(orig, round, "SqlScalarType changed across proto roundtrip");
+}
+
+fuzz_target!(|data: &[u8]| {
+    let Some((&mode, rest)) = data.split_first() else {
+        return;
+    };
+    if mode & 1 == 0 {
+        arbitrary_arm(rest);
+    } else {
+        raw_arm(rest);
+    }
+});
diff --git a/src/repr/fuzz/fuzz_targets/strconv_parse_array.rs b/src/repr/fuzz/fuzz_targets/strconv_parse_array.rs
new file mode 100644
index 0000000000000..1de027bf59736
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/strconv_parse_array.rs
@@ -0,0 +1,92 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `strconv::parse_array` parses untrusted SQL array literal text
+//! (`'{1,2,{3}}'::int[]`, COPY FROM) into elements + dimensions. It is a
+//! hand-written, recursive parser over quoting/escaping and nested `{…}`
+//! dimensions, a prime panic/OOM/stack-overflow surface. Element values are
+//! ignored. We fuzz the structural parser. Must never panic.
+//!
+//! Random text almost never forms a balanced, properly-quoted `{…}` nesting, so
+//! byte mutation barely reaches past the first dimension. We instead consume the
+//! byte stream as grammar choices and emit a valid nested array literal, with
+//! balanced braces, quoted elements with `\"`/`\\` escapes, and NULLs, so the
+//! recursive dimension walk and the quote/escape state machine run deep.
+//! A minority of inputs get structural noise spliced in (unbalanced braces,
+//! stray quotes/backslashes) so the parser's error paths stay covered too.
+
+#![no_main]
+
+use libfuzzer_sys::arbitrary::{self, Unstructured};
+use libfuzzer_sys::fuzz_target;
+
+/// Emit one scalar element: an unquoted token, a quoted string (with escapes
+/// and structural characters that *must* stay quoted), NULL, or empty.
+fn push_elem(u: &mut Unstructured, out: &mut String) -> arbitrary::Result<()> {
+    match u.int_in_range(0u8..=3)? {
+        0 => {
+            let n = u.int_in_range(0usize..=4)?;
+            for _ in 0..n {
+                out.push(*u.choose(&['a', 'b', '0', '1', '_', '-', '.'])?);
+            }
+        }
+        1 => out.push_str("NULL"),
+        2 => {
+            out.push('"');
+            let n = u.int_in_range(0usize..=4)?;
+            for _ in 0..n {
+                match u.int_in_range(0u8..=4)? {
+                    0 => out.push_str("\\\""),
+                    1 => out.push_str("\\\\"),
+                    _ => out.push(*u.choose(&['a', '1', ' ', '{', '}', ','])?),
+                }
+            }
+            out.push('"');
+        }
+        _ => {} // empty element
+    }
+    Ok(())
+}
+
+fn gen_array(u: &mut Unstructured, depth: u32, out: &mut String) -> arbitrary::Result<()> {
+    out.push('{');
+    let n = u.int_in_range(0usize..=3)?;
+    for i in 0..n {
+        if i > 0 {
+            out.push(',');
+            if u.int_in_range(0u8..=2)? == 0 {
+                out.push(' ');
+            }
+        }
+        if depth > 0 && u.int_in_range(0u8..=2)? == 0 {
+            gen_array(u, depth - 1, out)?;
+        } else {
+            push_elem(u, out)?;
+        }
+    }
+    out.push('}');
+    Ok(())
+}
+
+fn run(mut u: Unstructured) -> arbitrary::Result<()> {
+    let mut s = String::new();
+    gen_array(&mut u, 3, &mut s)?;
+    // 1-in-6: splice structural noise to exercise the error paths.
+    if u.int_in_range(0u8..=5)? == 0 {
+        s.push_str(u.choose(&["{", "}", ",", "\"", "\\", "{{", "}}", " ", "NULL"])?);
+    }
+    let _ = mz_repr::strconv::parse_array(&s, String::new, |e| {
+        Ok::<_, std::convert::Infallible>(e.into_owned())
+    });
+    Ok(())
+}
+
+fuzz_target!(|data: &[u8]| {
+    let _ = run(Unstructured::new(data));
+});
diff --git a/src/repr/fuzz/fuzz_targets/strconv_parse_bytes.rs b/src/repr/fuzz/fuzz_targets/strconv_parse_bytes.rs
new file mode 100644
index 0000000000000..e74cef3bc73b8
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/strconv_parse_bytes.rs
@@ -0,0 +1,121 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `strconv::parse_bytes` decodes untrusted `bytea` text (COPY
+//! FROM, a text-format `bytea` parameter on the wire). It dispatches between the
+//! hex form (`\x4a6b…`) and the hand-written "traditional" escape parser, which
+//! walks bytes looking for `\\` and three-digit octal escapes `\NNN`. This is
+//! exactly the byte-at-a-time escape-decoding shape that hid the CSV-decoder
+//! bug. Must never panic on any input.
+//!
+//! Random text rarely lands on the `\x` prefix or a well-formed `\NNN` octal
+//! triple, so byte mutation barely reaches either decoder. We instead consume
+//! the byte stream as grammar choices and emit `bytea` text biased at the
+//! decoders' edges:
+//! * hex form: `\x` followed by hex-digit pairs, deliberately including
+//!   odd-length runs (the `OddLength` error), embedded whitespace (the
+//!   skip-whitespace arm), and stray non-hex digits;
+//! * traditional form: octal escapes `\NNN` straddling the `\3xx`/`\4xx`
+//!   high-nibble boundary (only `\0..\3` lead bytes are valid), short octal
+//!   escapes, doubled `\\`, lone/trailing backslashes (the "ends with escape
+//!   character" arm), and raw literal bytes.
+//! A quarter of inputs are still the raw bytes, so the dispatch and reject paths
+//! stay covered.
+
+#![no_main]
+
+use libfuzzer_sys::arbitrary::{self, Unstructured};
+use libfuzzer_sys::fuzz_target;
+
+const HEX_DIGITS: &[char] = &[
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'A', 'B', 'C',
+    'D', 'E', 'F',
+];
+
+/// Octal digits for `\NNN` escapes: the full `0..=7` set, so lead digits land on
+/// both sides of the `\0..\3` (valid) / `\4..\7` (invalid high nibble) boundary.
+const OCTAL_DIGITS: &[char] = &['0', '1', '2', '3', '4', '5', '6', '7'];
+
+/// Emit a hex-form `bytea`: `\x` then a run of hex digits, with the occasional
+/// embedded whitespace (skipped) or stray non-hex byte. Odd-length runs hit the
+/// `OddLength` error.
+fn gen_hex(u: &mut Unstructured, out: &mut String) -> arbitrary::Result<()> {
+    out.push_str("\\x");
+    let n = u.int_in_range(0usize..=12)?;
+    for _ in 0..n {
+        match u.int_in_range(0u8..=9)? {
+            0 => out.push(*u.choose(&[' ', '\n', '\t', '\r'])?),
+            1 => out.push(*u.choose(&['g', 'x', '_', '-'])?),
+            _ => out.push(*u.choose(HEX_DIGITS)?),
+        }
+    }
+    Ok(())
+}
+
+/// Emit traditional-form `bytea`: literal bytes interleaved with octal escapes
+/// (`\NNN`, including the invalid `\4xx..\7xx` high nibble and short forms),
+/// doubled backslashes, and lone/trailing backslashes.
+fn gen_traditional(u: &mut Unstructured, out: &mut String) -> arbitrary::Result<()> {
+    let n = u.int_in_range(0usize..=10)?;
+    for _ in 0..n {
+        match u.int_in_range(0u8..=6)? {
+            // A full octal escape `\NNN`.
+            0 | 1 => {
+                out.push('\\');
+                for _ in 0..3 {
+                    out.push(*u.choose(OCTAL_DIGITS)?);
+                }
+            }
+            // A short/partial octal escape (invalid: too few digits).
+            2 => {
+                out.push('\\');
+                for _ in 0..u.int_in_range(0usize..=2)? {
+                    out.push(*u.choose(OCTAL_DIGITS)?);
+                }
+            }
+            // Doubled backslash (a literal `\`).
+            3 => out.push_str("\\\\"),
+            // A backslash plus a non-octal byte; the `\\` choice yields a doubled backslash.
+            4 => {
+                out.push('\\');
+                out.push(*u.choose(&['x', '8', '9', 'a', ' ', '\\'])?);
+            }
+            // Raw literal bytes.
+            _ => out.push(*u.choose(&['a', 'Z', '0', ' ', '\n', '~'])?),
+        }
+    }
+    // Sometimes leave a trailing escape char (the "ends with escape" arm).
+    if u.int_in_range(0u8..=4)? == 0 {
+        out.push('\\');
+    }
+    Ok(())
+}
+
+fn run(mut u: Unstructured) -> arbitrary::Result<()> {
+    // A quarter of the time, the raw bytes (must be UTF-8 for `parse_bytes`):
+    // keeps the dispatch and reject paths covered.
+    if u.int_in_range(0u8..=3)? == 0 {
+        if let Ok(s) = std::str::from_utf8(u.take_rest()) {
+            let _ = mz_repr::strconv::parse_bytes(s);
+        }
+        return Ok(());
+    }
+    let mut s = String::new();
+    if u.int_in_range(0u8..=1)? == 0 {
+        gen_hex(&mut u, &mut s)?;
+    } else {
+        gen_traditional(&mut u, &mut s)?;
+    }
+    let _ = mz_repr::strconv::parse_bytes(&s);
+    Ok(())
+}
+
+fuzz_target!(|data: &[u8]| {
+    let _ = run(Unstructured::new(data));
+});
diff --git a/src/repr/fuzz/fuzz_targets/strconv_parse_date.rs b/src/repr/fuzz/fuzz_targets/strconv_parse_date.rs
new file mode 100644
index 0000000000000..08fad748842dc
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/strconv_parse_date.rs
@@ -0,0 +1,33 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `strconv::parse_date` parses untrusted DATE literal text. A
+//! re-parseable rendering of a parsed value must yield the same value.
+
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use mz_repr::strconv::{format_date, parse_date};
+
+fuzz_target!(|data: &str| {
+    let Ok(d) = parse_date(data) else {
+        return;
+    };
+    let mut buf = String::new();
+    format_date(&mut buf, d);
+    // The renderer can emit text the parser rejects, a known cluster of
+    // date/time round-trip gaps (a leap second `:60` that PG carries, a
+    // >4-digit year, etc.), tracked separately, not panics. Tolerate those and
+    // only assert that a *re-parseable* rendering preserves the value (drift),
+    // and that nothing panics.
+    let Ok(reparsed) = parse_date(&buf) else {
+        return;
+    };
+    assert_eq!(d, reparsed, "date changed across parse/format round trip");
+});
diff --git a/src/repr/fuzz/fuzz_targets/strconv_parse_interval.rs b/src/repr/fuzz/fuzz_targets/strconv_parse_interval.rs
new file mode 100644
index 0000000000000..6ffdf6cc699cf
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/strconv_parse_interval.rs
@@ -0,0 +1,34 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `strconv::parse_interval` parses untrusted INTERVAL literal
+//! text. It drives a complex datetime token state machine (the most intricate
+//! parser in strconv). Beyond not panicking, its `Display` rendering must
+//! re-parse to the same interval.
+
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use mz_repr::strconv::parse_interval;
+
+fuzz_target!(|data: &str| {
+    let Ok(iv) = parse_interval(data) else {
+        return;
+    };
+    let formatted = iv.to_string();
+    // `Display` collapses sub-day time into one unbounded hours field, and
+    // re-parsing multiplies that hour count back out with checked arithmetic. A
+    // valid interval with micros near `i64::MAX` formats to an hour count whose
+    // re-parse overflows, so re-parse is not total. Tolerate the failure and
+    // only assert the round trip preserves the value when it does re-parse.
+    let Ok(reparsed) = parse_interval(&formatted) else {
+        return;
+    };
+    assert_eq!(iv, reparsed, "interval changed across parse/format round trip");
+});
diff --git a/src/repr/fuzz/fuzz_targets/strconv_parse_list.rs b/src/repr/fuzz/fuzz_targets/strconv_parse_list.rs
new file mode 100644
index 0000000000000..4e42ab7fc4c54
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/strconv_parse_list.rs
@@ -0,0 +1,92 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `strconv::parse_list` parses untrusted SQL list literal text.
+//! Recursive embedded-element lexing with quoting/escaping. Run both the
+//! scalar-element and nested-list-element modes. Must never panic.
+//!
+//! Random text almost never forms a balanced, properly-quoted `{…}` list, so
+//! byte mutation barely reaches past the opening brace. We instead consume the
+//! byte stream as grammar choices and emit a valid nested list literal, with
+//! balanced braces, quoted elements with `\"`/`\\` escapes, and NULLs, so the
+//! recursive element lexer and its quote/escape state machine run deep, in both
+//! the scalar-element and nested-list-element modes. A minority of inputs get
+//! structural noise spliced in so the parser's error paths stay covered too.
+
+#![no_main]
+
+use libfuzzer_sys::arbitrary::{self, Unstructured};
+use libfuzzer_sys::fuzz_target;
+
+/// Emit one scalar element: an unquoted token, a quoted string (with escapes
+/// and structural characters that *must* stay quoted), NULL, or empty.
+fn push_elem(u: &mut Unstructured, out: &mut String) -> arbitrary::Result<()> {
+    match u.int_in_range(0u8..=3)? {
+        0 => {
+            let n = u.int_in_range(0usize..=4)?;
+            for _ in 0..n {
+                out.push(*u.choose(&['a', 'b', '0', '1', '_', '-', '.'])?);
+            }
+        }
+        1 => out.push_str("NULL"),
+        2 => {
+            out.push('"');
+            let n = u.int_in_range(0usize..=4)?;
+            for _ in 0..n {
+                match u.int_in_range(0u8..=4)? {
+                    0 => out.push_str("\\\""),
+                    1 => out.push_str("\\\\"),
+                    _ => out.push(*u.choose(&['a', '1', ' ', '{', '}', ','])?),
+                }
+            }
+            out.push('"');
+        }
+        _ => {} // empty element
+    }
+    Ok(())
+}
+
+fn gen_list(u: &mut Unstructured, depth: u32, out: &mut String) -> arbitrary::Result<()> {
+    out.push('{');
+    let n = u.int_in_range(0usize..=3)?;
+    for i in 0..n {
+        if i > 0 {
+            out.push(',');
+            if u.int_in_range(0u8..=2)? == 0 {
+                out.push(' ');
+            }
+        }
+        if depth > 0 && u.int_in_range(0u8..=2)? == 0 {
+            gen_list(u, depth - 1, out)?;
+        } else {
+            push_elem(u, out)?;
+        }
+    }
+    out.push('}');
+    Ok(())
+}
+
+fn run(mut u: Unstructured) -> arbitrary::Result<()> {
+    let mut s = String::new();
+    gen_list(&mut u, 3, &mut s)?;
+    // 1-in-6: splice structural noise to exercise the error paths.
+    if u.int_in_range(0u8..=5)? == 0 {
+        s.push_str(u.choose(&["{", "}", ",", "\"", "\\", "{{", "}}", " ", "NULL"])?);
+    }
+    for is_element_type_list in [false, true] {
+        let _ = mz_repr::strconv::parse_list(&s, is_element_type_list, String::new, |e| {
+            Ok::<_, std::convert::Infallible>(e.into_owned())
+        });
+    }
+    Ok(())
+}
+
+fuzz_target!(|data: &[u8]| {
+    let _ = run(Unstructured::new(data));
+});
diff --git a/src/repr/fuzz/fuzz_targets/strconv_parse_map.rs b/src/repr/fuzz/fuzz_targets/strconv_parse_map.rs
new file mode 100644
index 0000000000000..30edf3985235c
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/strconv_parse_map.rs
@@ -0,0 +1,91 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `strconv::parse_map` parses untrusted SQL map literal text
+//! (`'{k=>v,…}'`). Key/value lexing with quoting/escaping and embedded maps.
+//! Run both the scalar-value and nested-map-value modes. Must never panic.
+//!
+//! Random text almost never forms a balanced `{k=>v,…}` map with the `=>`
+//! separators in the right places, so byte mutation barely reaches the value
+//! lexer. We instead consume the byte stream as grammar choices and emit a valid
+//! map literal, with balanced braces, quoted keys/values with `\"`/`\\`
+//! escapes, NULL values, and (in nested-value mode) embedded maps, so the
+//! key/value split, the quote/escape state machine, and the recursive value
+//! parse all run deep. A minority of inputs get structural noise spliced in so
+//! the parser's error paths stay covered too.
+
+#![no_main]
+
+use libfuzzer_sys::arbitrary::{self, Unstructured};
+use libfuzzer_sys::fuzz_target;
+
+/// Emit a non-empty unquoted token or a possibly-empty quoted one (keys and scalar values).
+fn push_token(u: &mut Unstructured, out: &mut String) -> arbitrary::Result<()> {
+    if u.int_in_range(0u8..=1)? == 0 {
+        let n = u.int_in_range(1usize..=4)?;
+        for _ in 0..n {
+            out.push(*u.choose(&['a', 'b', '0', '1', '_', '-'])?);
+        }
+    } else {
+        out.push('"');
+        let n = u.int_in_range(0usize..=4)?;
+        for _ in 0..n {
+            match u.int_in_range(0u8..=4)? {
+                0 => out.push_str("\\\""),
+                1 => out.push_str("\\\\"),
+                _ => out.push(*u.choose(&['a', '1', ' ', '{', '}', ',', '='])?),
+            }
+        }
+        out.push('"');
+    }
+    Ok(())
+}
+
+fn gen_map(u: &mut Unstructured, depth: u32, out: &mut String) -> arbitrary::Result<()> {
+    out.push('{');
+    let n = u.int_in_range(0usize..=3)?;
+    for i in 0..n {
+        if i > 0 {
+            out.push(',');
+            if u.int_in_range(0u8..=2)? == 0 {
+                out.push(' ');
+            }
+        }
+        push_token(u, out)?; // key
+        out.push_str("=>");
+        if depth > 0 && u.int_in_range(0u8..=2)? == 0 {
+            gen_map(u, depth - 1, out)?; // nested map value
+        } else if u.int_in_range(0u8..=4)? == 0 {
+            out.push_str("NULL");
+        } else {
+            push_token(u, out)?; // scalar value
+        }
+    }
+    out.push('}');
+    Ok(())
+}
+
+fn run(mut u: Unstructured) -> arbitrary::Result<()> {
+    let mut s = String::new();
+    gen_map(&mut u, 3, &mut s)?;
+    // 1-in-6: splice structural noise to exercise the error paths.
+    if u.int_in_range(0u8..=5)? == 0 {
+        s.push_str(u.choose(&["{", "}", ",", "=>", "=", "\"", "\\", " ", "NULL"])?);
+    }
+    for is_value_type_map in [false, true] {
+        let _ = mz_repr::strconv::parse_map(&s, is_value_type_map, |e| {
+            Ok::<_, std::convert::Infallible>(e.map(|v| v.into_owned()))
+        });
+    }
+    Ok(())
+}
+
+fuzz_target!(|data: &[u8]| {
+    let _ = run(Unstructured::new(data));
+});
diff --git a/src/repr/fuzz/fuzz_targets/strconv_parse_numeric.rs b/src/repr/fuzz/fuzz_targets/strconv_parse_numeric.rs
new file mode 100644
index 0000000000000..38e48f7a1ff92
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/strconv_parse_numeric.rs
@@ -0,0 +1,125 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `strconv::parse_numeric` parses untrusted numeric literal text
+//! into a fixed-precision decimal. Beyond not panicking, the canonical
+//! standard-notation rendering must re-parse to the same value.
+//!
+//! Random text rarely lands near the interesting decimal edges, the 39-digit
+//! precision limit, the exponent over/underflow boundary, or the
+//! `NaN`/`Infinity` rejection arms, so byte mutation mostly bounces off the
+//!
initial `cx.parse`. We instead consume the byte stream as grammar choices and
+//! emit numeric literals biased at those edges: long integer/fraction digit runs
+//! straddling the precision cap (which triggers `munge_numeric` rounding and the
+//! out-of-range arm), exponents near the representable range, optional signs and
+//! surrounding whitespace (`parse_numeric` trims), and the special
+//! `NaN`/`Infinity`/`-NaN`/signaling spellings that the post-parse validation
+//! rejects. The strict round-trip oracle, canonical rendering must re-parse to
+//! the same value, makes every parse that survives high signal. A quarter of
+//! inputs are still the raw string so the reject paths stay covered.
+
+#![no_main]
+
+use libfuzzer_sys::arbitrary::{self, Unstructured};
+use libfuzzer_sys::fuzz_target;
+use mz_repr::strconv::parse_numeric;
+
+/// Special spellings: the infinity/NaN family the post-parse validation rejects
+/// (or accepts as overflow-`inf`), plus assorted casing.
+const SPECIALS: &[&str] = &[
+    "NaN", "nan", "-NaN", "+NaN", "sNaN", "Infinity", "-Infinity", "+Infinity", "inf", "-inf",
+    "Inf", "INFINITY",
+];
+
+fn push_digits(u: &mut Unstructured, out: &mut String, n: usize) -> arbitrary::Result<()> {
+    for _ in 0..n {
+        out.push(*u.choose(&['0', '1', '5', '7', '9'])?);
+    }
+    Ok(())
+}
+
+fn run(mut u: Unstructured) -> arbitrary::Result<()> {
+    // A quarter of the time, the raw string: keeps the reject paths covered.
+    if u.int_in_range(0u8..=3)? == 0 {
+        if let Ok(s) = std::str::from_utf8(u.take_rest()) {
+            check(s);
+        }
+        return Ok(());
+    }
+
+    let mut s = String::new();
+    // Optional leading whitespace (`parse_numeric` trims).
+    if u.int_in_range(0u8..=3)? == 0 {
+        s.push(*u.choose(&[' ', '\t', '\n'])?);
+    }
+
+    match u.int_in_range(0u8..=5)? {
+        // A special infinity/NaN spelling (hits the rejection arms).
+        0 => s.push_str(u.choose(SPECIALS)?),
+        _ => {
+            // Optional sign.
+            match u.int_in_range(0u8..=2)? {
+                0 => s.push('-'),
+                1 => s.push('+'),
+                _ => {}
+            }
+            // Integer part: 0..=45 digits, a run straddling the 39-digit precision cap.
+            let int_len = u.int_in_range(0usize..=45)?;
+            push_digits(&mut u, &mut s, int_len)?;
+            // Optional fraction, also able to push total significant digits past
+            // the precision limit.
+            if u.int_in_range(0u8..=1)? == 0 {
+                s.push('.');
+                let frac_len = u.int_in_range(0usize..=45)?;
+                // Guarantee at least one digit somewhere.
+                let frac_len = if int_len == 0 && frac_len == 0 {
+                    1
+                } else {
+                    frac_len
+                };
+                push_digits(&mut u, &mut s, frac_len)?;
+            } else if int_len == 0 {
+                s.push('0');
+            }
+            // Optional exponent near the representable range.
+            if u.int_in_range(0u8..=2)? == 0 {
+                s.push(*u.choose(&['e', 'E'])?);
+                match u.int_in_range(0u8..=2)? {
+                    0 => s.push('-'),
+                    1 => s.push('+'),
+                    _ => {}
+                }
+                // Magnitudes around the overflow/underflow boundary.
+                let exp = u.choose(&["0", "9", "38", "39", "308", "1000", "6144", "999999"])?;
+                s.push_str(exp);
+            }
+        }
+    }
+
+    // Optional trailing whitespace.
+    if u.int_in_range(0u8..=3)? == 0 {
+        s.push(*u.choose(&[' ', '\t', '\n'])?);
+    }
+
+    check(&s);
+    Ok(())
+}
+
+fn check(s: &str) {
+    let Ok(n) = parse_numeric(s) else {
+        return;
+    };
+    let formatted = n.0.to_standard_notation_string();
+    let reparsed = parse_numeric(&formatted).expect("canonical numeric rendering must re-parse");
+    assert_eq!(n, reparsed, "numeric changed across parse/format round trip");
+}
+
+fuzz_target!(|data: &[u8]| {
+    let _ = run(Unstructured::new(data));
+});
diff --git a/src/repr/fuzz/fuzz_targets/strconv_parse_range.rs b/src/repr/fuzz/fuzz_targets/strconv_parse_range.rs
new file mode 100644
index 0000000000000..a48f1da53f5d8
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/strconv_parse_range.rs
@@ -0,0 +1,97 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `strconv::parse_range` parses untrusted SQL range literal text
+//! (`'[a,b)'`). Bound extraction with quoting/escaping. Must never panic.
+//!
+//! Random text rarely produces the `[`/`(` … `,` … `]`/`)` framing (or the
+//! `empty` keyword) the parser needs, so byte mutation barely reaches the bound
+//! lexer. We instead consume the byte stream as grammar choices and emit a
+//! range literal exercising the bound extraction's sharp edges:
+//! * the lower bound is read with a naive `take_while(c != ',')` and the upper
+//!   with `take_while(c not in ')]')`, neither of which understands quoting,
+//!   so we deliberately embed `,`, `]`, `)` *inside* quoted bounds to drive
+//!   that truncation;
+//! * matched and mismatched bracket/paren framing (`[..)`, `(..]`, dropped
+//!   closer);
+//! * optional (unbounded) sides, quoted bounds with `\"`/`\\` escapes, and the
+//!   `empty` keyword followed by junk (the "Junk after empty" arm).
+//! A minority of inputs get extra structural noise spliced in so the parser's
+//! error paths stay covered too.
+
+#![no_main]
+
+use libfuzzer_sys::arbitrary::{self, Unstructured};
+use libfuzzer_sys::fuzz_target;
+
+/// Emit one range bound: empty (unbounded), an unquoted token, or a quoted
+/// string with escapes and structural delimiters (`,`/`]`/`)`) that the naive
+/// `take_while` extraction does *not* know to keep inside the quotes.
+fn push_bound(u: &mut Unstructured, out: &mut String) -> arbitrary::Result<()> {
+    match u.int_in_range(0u8..=2)? {
+        0 => {} // unbounded
+        1 => {
+            let n = u.int_in_range(1usize..=4)?;
+            for _ in 0..n {
+                out.push(*u.choose(&['a', '0', '1', '-', '.', ':'])?);
+            }
+        }
+        _ => {
+            out.push('"');
+            let n = u.int_in_range(0usize..=6)?;
+            for _ in 0..n {
+                match u.int_in_range(0u8..=5)? {
+                    0 => out.push_str("\\\""),
+                    1 => out.push_str("\\\\"),
+                    // Bias toward the range delimiters so they land *inside* the
+                    // quotes, exercising the naive take_while truncation.
+                    2 | 3 => out.push(*u.choose(&[',', ']', ')', '['])?),
+                    _ => out.push(*u.choose(&['a', '1', ' ', '('])?),
+                }
+            }
+            out.push('"');
+        }
+    }
+    Ok(())
+}
+
+fn run(mut u: Unstructured) -> arbitrary::Result<()> {
+    let mut s = String::new();
+    if u.int_in_range(0u8..=6)? == 0 {
+        s.push_str("empty");
+        // Half the time, follow `empty` with 1..=3 junk chars (the "Junk after empty" arm).
+        if u.int_in_range(0u8..=1)? == 0 {
+            for _ in 0..u.int_in_range(1usize..=3)? {
+                s.push(*u.choose(&[' ', ',', '[', ']', '(', ')', 'x', '"'])?);
+            }
+        }
+    } else {
+        // Independent open/close framing so mismatched brackets/parens arise.
+        s.push(if u.int_in_range(0u8..=1)? == 0 { '[' } else { '(' });
+        push_bound(&mut u, &mut s)?;
+        s.push(',');
+        push_bound(&mut u, &mut s)?;
+        // Sometimes drop the closer entirely (truncated framing).
+        if u.int_in_range(0u8..=5)? != 0 {
+            s.push(if u.int_in_range(0u8..=1)? == 0 { ']' } else { ')' });
+        }
+    }
+    // 1-in-5: splice extra structural noise to exercise the error paths.
+    if u.int_in_range(0u8..=4)? == 0 {
+        s.push_str(u.choose(&["[", "]", "(", ")", ",", "\"", "\\", " ", "empty"])?);
+    }
+    let _ = mz_repr::strconv::parse_range(&s, |e| {
+        Ok::<_, std::convert::Infallible>(e.into_owned())
+    });
+    Ok(())
+}
+
+fuzz_target!(|data: &[u8]| {
+    let _ = run(Unstructured::new(data));
+});
diff --git a/src/repr/fuzz/fuzz_targets/strconv_parse_time.rs b/src/repr/fuzz/fuzz_targets/strconv_parse_time.rs
new file mode 100644
index 0000000000000..07a22be65983d
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/strconv_parse_time.rs
@@ -0,0 +1,36 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `strconv::parse_time` parses untrusted TIME literal text.
+
+#![no_main]
+
+use chrono::Timelike;
+use libfuzzer_sys::fuzz_target;
+use mz_repr::strconv::{format_time, parse_time};
+
+fuzz_target!(|data: &str| {
+    let Ok(t) = parse_time(data) else {
+        return;
+    };
+    // mz TIME keeps nanosecond precision (its cast, unlike TIMESTAMP's, does not
+    // round) but renders microseconds, and the parser accepts a leap second
+    // (`:60`) the renderer can't round-trip. Both are known TIME/PG-compat gaps
+    // tracked separately. Skip sub-microsecond and leap-second values.
+    let nanos = t.nanosecond();
+    if nanos % 1_000 != 0 || nanos >= 1_000_000_000 {
+        return;
+    }
+    let mut buf = String::new();
+    format_time(&mut buf, t);
+    let Ok(reparsed) = parse_time(&buf) else {
+        return; // a rendering the parser rejects is tolerated; only value drift is asserted
+    };
+    assert_eq!(t, reparsed, "time changed across parse/format round trip");
+});
diff --git a/src/repr/fuzz/fuzz_targets/strconv_parse_timestamp.rs b/src/repr/fuzz/fuzz_targets/strconv_parse_timestamp.rs
new file mode 100644
index 0000000000000..d5ca025c3e0ff
--- /dev/null
+++ b/src/repr/fuzz/fuzz_targets/strconv_parse_timestamp.rs
@@ -0,0 +1,51 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `strconv::parse_timestamp` parses untrusted TIMESTAMP literal
+//! text. `CastStringToTimestamp` rounds the parsed value to the type's
+//! precision (microseconds by default) before storage, so mirror that.
+ +#![no_main] + +use chrono::Timelike; +use libfuzzer_sys::fuzz_target; +use mz_repr::strconv::{format_timestamp, parse_timestamp}; + +fuzz_target!(|data: &str| { + let Ok(ts) = parse_timestamp(data) else { + return; + }; + // The parser accepts a leap second (`:60`), stored as chrono's leap + // representation (sub-second >= 1s). PostgreSQL carries `:60` to the next + // minute and our microsecond renderer mis-encodes the leap, so such values + // do not round-trip. This is a known parser/PG-compat gap tracked + // separately, not a panic. Skip them (rounding to precision below can't + // create a leap). + if ts.nanosecond() >= 1_000_000_000 { + return; + } + let Ok(ts) = ts.round_to_precision(None) else { + return; + }; + let mut buf = String::new(); + format_timestamp(&mut buf, &ts); + // The renderer can also emit text the parser rejects (e.g. a >4-digit + // year), also tracked separately. Tolerate that (re-parse failure) and only + // assert that a re-parseable rendering preserves the value, plus no panics. + let Ok(reparsed) = parse_timestamp(&buf) else { + return; + }; + let Ok(reparsed) = reparsed.round_to_precision(None) else { + return; + }; + assert_eq!( + ts, reparsed, + "timestamp changed across parse/format round trip" + ); +}); diff --git a/src/repr/fuzz/fuzz_targets/strconv_parse_timestamptz.rs b/src/repr/fuzz/fuzz_targets/strconv_parse_timestamptz.rs new file mode 100644 index 0000000000000..d4428d5b7a5c8 --- /dev/null +++ b/src/repr/fuzz/fuzz_targets/strconv_parse_timestamptz.rs @@ -0,0 +1,51 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `strconv::parse_timestamptz` parses untrusted TIMESTAMPTZ +//! literal text. 
`CastStringToTimestampTz` rounds to the type's precision +//! (microseconds by default) before storage, so mirror that. + +#![no_main] + +use chrono::Timelike; +use libfuzzer_sys::fuzz_target; +use mz_repr::strconv::{format_timestamptz, parse_timestamptz}; + +fuzz_target!(|data: &str| { + let Ok(ts) = parse_timestamptz(data) else { + return; + }; + // The parser accepts a leap second (`:60`), stored as chrono's leap + // representation (sub-second >= 1s). PostgreSQL carries `:60` to the next + // minute and our microsecond renderer mis-encodes the leap, so such values + // do not round-trip. This is a known parser/PG-compat gap tracked + // separately, not a panic. Skip them (rounding to precision below can't + // create a leap). + if ts.nanosecond() >= 1_000_000_000 { + return; + } + let Ok(ts) = ts.round_to_precision(None) else { + return; + }; + let mut buf = String::new(); + format_timestamptz(&mut buf, &ts); + // The renderer can also emit text the parser rejects (e.g. a >4-digit + // year), also tracked separately. Tolerate that (re-parse failure) and only + // assert that a re-parseable rendering preserves the value, plus no panics. + let Ok(reparsed) = parse_timestamptz(&buf) else { + return; + }; + let Ok(reparsed) = reparsed.round_to_precision(None) else { + return; + }; + assert_eq!( + ts, reparsed, + "timestamptz changed across parse/format round trip" + ); +}); diff --git a/src/repr/fuzz/fuzz_targets/strconv_parse_uuid.rs b/src/repr/fuzz/fuzz_targets/strconv_parse_uuid.rs new file mode 100644 index 0000000000000..0cc14da278b25 --- /dev/null +++ b/src/repr/fuzz/fuzz_targets/strconv_parse_uuid.rs @@ -0,0 +1,117 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. 
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: `strconv::parse_uuid` decodes untrusted `uuid` text (COPY FROM,
+//! a text-format `uuid` parameter on the wire). The `uuid` crate itself panics
+//! while *constructing a parse error* for some short, brace-wrapped inputs
+//! (e.g. `{}`, `{\0}`). It mis-slices the input, so `parse_uuid` pre-screens
+//! the string before delegating. This is a regression guard for that fix: no
+//! input may panic.
+//!
+//! Random text essentially never hits the `{`-wrapped / `urn:uuid:` / exactly-32
+//! / exactly-36-with-dashes shapes the crate special-cases (and the pre-screen
+//! gates on `len >= 32` + ASCII), so byte mutation barely reaches the crash
+//! boundary. We instead consume the byte stream as grammar choices and emit
+//! inputs biased right at it: brace-wrapped inputs of total length 29-34 (a
+//! 28-32 character body plus the braces, the closer sometimes omitted) with
+//! embedded NUL / non-hex (the mis-slice trigger), `urn:uuid:` prefixes,
+//! exactly-32-hex and 36-char dashed forms with a few corrupted bytes, and
+//! surrounding whitespace (`parse_uuid` trims). A quarter of inputs are still
+//! the raw string so the pre-screen reject paths and non-ASCII handling stay
+//! covered.
+
+#![no_main]
+
+use libfuzzer_sys::arbitrary::{self, Unstructured};
+use libfuzzer_sys::fuzz_target;
+
+// Alphabet for "valid" body characters: every lowercase hex digit plus two
+// uppercase samples (`A`, `F`).
+const HEX: &[char] = &[
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'A', 'F',
+];
+
+/// Push one body character: usually a hex digit, occasionally a corrupting byte
+/// (NUL, non-hex ASCII, or a stray dash) to derail the crate's slicing.
+///
+/// The split is one in ten NUL, one in ten a character from the non-hex set,
+/// and eight in ten a character from `HEX`. Errors from `Unstructured` are
+/// propagated with `?`.
+fn push_body_char(u: &mut Unstructured, out: &mut String) -> arbitrary::Result<()> {
+    match u.int_in_range(0u8..=9)? {
+        0 => out.push('\0'),
+        1 => out.push(*u.choose(&['g', 'z', '-', ' ', '{', '}', ':'])?),
+        _ => out.push(*u.choose(HEX)?),
+    }
+    Ok(())
+}
+
+/// Turn the fuzzer's byte stream into one `parse_uuid` input and parse it,
+/// discarding the result (only a panic is a finding).
+///
+/// A quarter of inputs pass the remaining bytes through unchanged when they
+/// are valid UTF-8. The rest are assembled from grammar choices: optional
+/// leading whitespace, one of four UUID-like shapes, optional trailing
+/// whitespace. Errors from `Unstructured` are propagated with `?`.
+fn run(mut u: Unstructured) -> arbitrary::Result<()> {
+    // A quarter of the time, the raw string: keeps the pre-screen reject paths
+    // and non-ASCII handling covered.
+    if u.int_in_range(0u8..=3)? == 0 {
+        // `take_rest` consumes `u`, so this branch must return right after.
+        if let Ok(s) = std::str::from_utf8(u.take_rest()) {
+            let _ = mz_repr::strconv::parse_uuid(s);
+        }
+        return Ok(());
+    }
+
+    let mut s = String::new();
+    // Optional leading whitespace (`parse_uuid` trims).
+    if u.int_in_range(0u8..=4)? == 0 {
+        s.push(*u.choose(&[' ', '\t', '\n'])?);
+    }
+
+    // Shape weights out of five: brace-wrapped 2, `urn:uuid:` 1, 32-char simple
+    // 1, 36-char hyphenated 1.
+    match u.int_in_range(0u8..=4)? {
+        // Brace-wrapped body whose total length straddles the pre-screen's
+        // 32-byte floor, the historical mis-slice / panic boundary.
+        0 | 1 => {
+            s.push('{');
+            let n = u.int_in_range(28usize..=32)?;
+            for _ in 0..n {
+                push_body_char(&mut u, &mut s)?;
+            }
+            // Sometimes omit the closing brace (unbalanced framing).
+            if u.int_in_range(0u8..=2)? != 0 {
+                s.push('}');
+            }
+        }
+        // `urn:uuid:` prefixed form.
+        2 => {
+            s.push_str("urn:uuid:");
+            emit_canonical(&mut u, &mut s)?;
+        }
+        // Exactly-32 hex (simple form), with a few corrupted bytes.
+        3 => {
+            for _ in 0..32 {
+                push_body_char(&mut u, &mut s)?;
+            }
+        }
+        // 36-char dashed (hyphenated) form, with a few corrupted bytes.
+        _ => emit_canonical(&mut u, &mut s)?,
+    }
+
+    // Optional trailing whitespace.
+    if u.int_in_range(0u8..=4)? == 0 {
+        s.push(*u.choose(&[' ', '\t', '\n'])?);
+    }
+
+    let _ = mz_repr::strconv::parse_uuid(&s);
+    Ok(())
+}
+
+/// Emit the canonical `8-4-4-4-12` hyphenated form, with `push_body_char` so a
+/// few bytes can be corrupted.
+fn emit_canonical(u: &mut Unstructured, out: &mut String) -> arbitrary::Result<()> { + for (i, &group) in [8usize, 4, 4, 4, 12].iter().enumerate() { + if i > 0 { + out.push('-'); + } + for _ in 0..group { + push_body_char(u, out)?; + } + } + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let _ = run(Unstructured::new(data)); +}); diff --git a/src/sql-parser/fuzz/.gitignore b/src/sql-parser/fuzz/.gitignore new file mode 100644 index 0000000000000..894cb83fe9df6 --- /dev/null +++ b/src/sql-parser/fuzz/.gitignore @@ -0,0 +1,6 @@ +target/ +corpus/ +artifacts/ +coverage/ +Cargo.lock +corpus.dict diff --git a/src/sql-parser/fuzz/Cargo.toml b/src/sql-parser/fuzz/Cargo.toml new file mode 100644 index 0000000000000..491713537d24c --- /dev/null +++ b/src/sql-parser/fuzz/Cargo.toml @@ -0,0 +1,48 @@ +# Fuzz crate for mz-sql-parser and mz-sql-pretty round-trip properties. +# +# Excluded from the main workspace because libFuzzer requires nightly Rust. +# Run via the repo-wide runner: `bin/ci-builder run nightly ci/test/cargo-fuzz.sh`, +# or locally: +# cd src/sql-parser/fuzz +# cargo +nightly fuzz run sql_roundtrip -- -max_total_time=60 +# +# A single round-trip oracle (`parse -> AstDisplay/pretty -> reparse` must +# preserve the AST). `sql_roundtrip` generates its input from the libFuzzer byte +# stream as grammar choices: mostly a structured statement (deep, valid, the +# round-trip the printer must preserve), with a minority of full-vocabulary +# "soup" (every keyword/operator/literal form + random characters, to reach +# query shapes the structured productions don't enumerate and the parser/lexer +# error paths). It interprets bytes as choices, so it needs no seed corpus or +# dictionary. + +[package] +workspace = "../../../test/cargo-fuzz" +name = "mz-sql-parser-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +mz-sql-parser = { path = ".." 
}
+mz-sql-pretty = { path = "../../sql-pretty" }
+
+[[bin]]
+name = "sql_roundtrip"
+path = "fuzz_targets/sql_roundtrip.rs"
+test = false
+doc = false
+bench = false
+
+# Same round-trip oracle as `sql_roundtrip`, but the input language is generated
+# from the declarative grammar in `grammar/sql.g`, compiled to a rule table by
+# `build.rs`. Extend the SQL surface by editing the grammar file.
+[[bin]]
+name = "grammar"
+path = "fuzz_targets/grammar.rs"
+test = false
+doc = false
+bench = false
diff --git a/src/sql-parser/fuzz/build.rs b/src/sql-parser/fuzz/build.rs
new file mode 100644
index 0000000000000..6bf65989fbb93
--- /dev/null
+++ b/src/sql-parser/fuzz/build.rs
@@ -0,0 +1,205 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Compiles `grammar/sql.g` into a Rust rule table (`$OUT_DIR/grammar.rs`) that
+//! the `grammar` fuzz target walks. Running this at build time keeps the grammar
+//! file the single source of truth. There is no checked-in generated file to
+//! drift. See `grammar/sql.g` for the format.
+
+use std::env;
+use std::fs;
+use std::path::Path;
+
+/// One element of a grammar alternative, as parsed by `tokenize`.
+#[derive(Clone)]
+enum Item {
+    /// A `"quoted"` terminal, with its escapes already resolved.
+    Lit(String),
+    /// A bare word: a reference to another rule, by name.
+    Ref(String),
+    /// The `@ident` builtin.
+    Ident,
+    /// The `@kw` builtin.
+    Kw,
+    /// The `@int` builtin.
+    Int,
+    /// The `@str` builtin.
+    Str,
+}
+
+/// Read `grammar/sql.g`, parse it into rules, and write the generated rule
+/// table to `$OUT_DIR/grammar.rs`.
+///
+/// Panics (failing the build) when the grammar file cannot be read, a rule has
+/// no `=`, the grammar is empty, a rule reference (or the `statement` start
+/// rule) is undefined, or a string literal is unterminated.
+fn main() {
+    let manifest = env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR");
+    let grammar_path = Path::new(&manifest).join("grammar/sql.g");
+    println!("cargo:rerun-if-changed={}", grammar_path.display());
+    let src = fs::read_to_string(&grammar_path)
+        .unwrap_or_else(|e| panic!("reading {}: {e}", grammar_path.display()));
+
+    // Drop comment lines, keep the rest as one stream.
+    let mut text = String::new();
+    for line in src.lines() {
+        if line.trim_start().starts_with('#') {
+            continue;
+        }
+        text.push_str(line);
+        text.push('\n');
+    }
+
+    // Each rule is `name = body` terminated by an unquoted `;`.
+    let mut names: Vec<String> = Vec::new();
+    let mut bodies: Vec<String> = Vec::new();
+    for chunk in split_unquoted(&text, ';') {
+        let chunk = chunk.trim();
+        if chunk.is_empty() {
+            continue;
+        }
+        let eq = find_unquoted(chunk, '=').unwrap_or_else(|| panic!("rule has no `=`: {chunk:?}"));
+        let name = chunk[..eq].trim().to_string();
+        // References resolve to the *first* rule with a given name (see
+        // `index_of`), so a second definition would be silently unreachable.
+        // Surface that instead of hiding it.
+        if names.contains(&name) {
+            println!("cargo:warning=grammar rule `{name}` is defined more than once; only the first definition is reachable");
+        }
+        names.push(name);
+        bodies.push(chunk[eq + 1..].to_string());
+    }
+    if names.is_empty() {
+        panic!("grammar has no rules");
+    }
+
+    // Resolve a rule name to its index in `names` (and so in `RULES`).
+    let index_of = |name: &str| -> usize {
+        names
+            .iter()
+            .position(|n| n == name)
+            .unwrap_or_else(|| panic!("reference to undefined rule `{name}`"))
+    };
+
+    // Parse each body into alternatives of items.
+    let rules: Vec<Vec<Vec<Item>>> = bodies
+        .iter()
+        .map(|body| split_unquoted(body, '|').iter().map(|a| tokenize(a)).collect())
+        .collect();
+
+    // Emit the rule table.
+    let mut out = String::new();
+    out.push_str("// @generated by build.rs from grammar/sql.g, do not edit.\n");
+    out.push_str("#[derive(Clone, Copy)]\n");
+    out.push_str("pub enum Item { Lit(&'static str), Rule(usize), Ident, Kw, Int, Str }\n");
+    out.push_str("pub struct Rule { pub alts: &'static [&'static [Item]], pub leaf_alt: usize }\n");
+    out.push_str("pub static RULES: &[Rule] = &[\n");
+    for alts in &rules {
+        // The "leaf" alternative (fewest rule references, then fewest items) is
+        // used once the recursion-depth budget is spent, so generation always
+        // winds down to terminals.
+        let leaf_alt = alts
+            .iter()
+            .enumerate()
+            .min_by_key(|(_, a)| {
+                let refs = a.iter().filter(|i| matches!(i, Item::Ref(_))).count();
+                (refs, a.len())
+            })
+            .map(|(i, _)| i)
+            .unwrap_or(0);
+        out.push_str(" Rule { alts: &[\n");
+        for alt in alts {
+            out.push_str(" &[");
+            for item in alt {
+                match item {
+                    Item::Lit(s) => out.push_str(&format!("Item::Lit({s:?}), ")),
+                    Item::Ref(n) => out.push_str(&format!("Item::Rule({}), ", index_of(n))),
+                    Item::Ident => out.push_str("Item::Ident, "),
+                    Item::Kw => out.push_str("Item::Kw, "),
+                    Item::Int => out.push_str("Item::Int, "),
+                    Item::Str => out.push_str("Item::Str, "),
+                }
+            }
+            out.push_str("],\n");
+        }
+        out.push_str(&format!(" ], leaf_alt: {leaf_alt} }},\n"));
+    }
+    out.push_str("];\n");
+    out.push_str(&format!("pub static START: usize = {};\n", index_of("statement")));
+
+    let out_dir = env::var("OUT_DIR").expect("OUT_DIR");
+    fs::write(Path::new(&out_dir).join("grammar.rs"), out).expect("write grammar.rs");
+}
+
+/// Split `s` on `sep`, ignoring `sep` inside double-quoted literals (so a `|` or
+/// `;` inside a `"..."` terminal is not a separator). Honors `\`-escapes inside
+/// quotes.
+///
+/// Quotes and escapes are copied through unchanged. The result always has at
+/// least one element (the text after the last separator, possibly empty).
+fn split_unquoted(s: &str, sep: char) -> Vec<String> {
+    let mut parts = Vec::new();
+    let mut cur = String::new();
+    let mut in_quote = false;
+    let mut chars = s.chars();
+    while let Some(c) = chars.next() {
+        if c == '"' {
+            in_quote = !in_quote;
+            cur.push(c);
+        } else if c == '\\' && in_quote {
+            // Copy the escape and the escaped character verbatim, so an
+            // escaped `"` does not close the literal.
+            cur.push(c);
+            if let Some(n) = chars.next() {
+                cur.push(n);
+            }
+        } else if c == sep && !in_quote {
+            parts.push(std::mem::take(&mut cur));
+        } else {
+            cur.push(c);
+        }
+    }
+    parts.push(cur);
+    parts
+}
+
+/// Byte index of the first `ch` not inside a double-quoted literal.
+///
+/// Returns `None` when every occurrence of `ch` is quoted or there is none. A
+/// `\` inside quotes escapes the next character.
+fn find_unquoted(s: &str, ch: char) -> Option<usize> {
+    let mut in_quote = false;
+    let mut escaped = false;
+    for (i, c) in s.char_indices() {
+        if escaped {
+            escaped = false;
+        } else if c == '\\' && in_quote {
+            escaped = true;
+        } else if c == '"' {
+            in_quote = !in_quote;
+        } else if c == ch && !in_quote {
+            return Some(i);
+        }
+    }
+    None
+}
+
+/// Tokenize one alternative into items: `"literal"` terminals, `@builtin`s, and
+/// bare rule references.
+///
+/// Inside a literal, `\x` stands for the character `x`. Panics on a literal
+/// with no closing quote: it would otherwise absorb the rest of the
+/// alternative as a single terminal.
+fn tokenize(alt: &str) -> Vec<Item> {
+    let chars: Vec<char> = alt.chars().collect();
+    let mut items = Vec::new();
+    let mut i = 0;
+    while i < chars.len() {
+        if chars[i].is_whitespace() {
+            i += 1;
+            continue;
+        }
+        if chars[i] == '"' {
+            i += 1;
+            let mut lit = String::new();
+            while i < chars.len() && chars[i] != '"' {
+                if chars[i] == '\\' && i + 1 < chars.len() {
+                    lit.push(chars[i + 1]);
+                    i += 2;
+                } else {
+                    lit.push(chars[i]);
+                    i += 1;
+                }
+            }
+            if i >= chars.len() {
+                panic!("unterminated string literal in grammar alternative: {alt:?}");
+            }
+            i += 1; // consume closing quote
+            items.push(Item::Lit(lit));
+        } else {
+            let start = i;
+            while i < chars.len() && !chars[i].is_whitespace() {
+                i += 1;
+            }
+            let word: String = chars[start..i].iter().collect();
+            items.push(match word.as_str() {
+                "@ident" => Item::Ident,
+                "@kw" => Item::Kw,
+                "@int" => Item::Int,
+                "@str" => Item::Str,
+                _ => Item::Ref(word),
+            });
+        }
+    }
+    items
+}
diff --git a/src/sql-parser/fuzz/fuzz_targets/grammar.rs b/src/sql-parser/fuzz/fuzz_targets/grammar.rs
new file mode 100644
index 0000000000000..3760e29e71ee2
--- /dev/null
+++ b/src/sql-parser/fuzz/fuzz_targets/grammar.rs
@@ -0,0 +1,286 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0.
+
+//! Fuzz target: the SQL round-trip oracle (`parse -> AstDisplay/pretty ->
+//! 
reparse` must preserve the AST), driven by a **declarative grammar file**
+//! (`grammar/sql.g`) rather than a hand-written generator.
+//!
+//! `build.rs` compiles `grammar/sql.g` into a rule table (`Item`/`Rule`/`RULES`/
+//! `START`, included below). This target walks that table from the start rule,
+//! consuming the libFuzzer byte stream as a sequence of production choices, to
+//! emit a syntactically plausible statement. Coverage feedback steers the
+//! byte->choice mapping. Each generated statement that parses is checked against
+//! two oracles: full-AST equality through `pretty_str_simple` at two line
+//! widths, and full-AST equality through `AstDisplay` (a structural comparison,
+//! not the stable-string comparison the hand-written `sql_roundtrip` target uses).
+//!
+//! The grammar file is the single source of truth for the input language: extend
+//! the SQL surface by editing `grammar/sql.g`, with no generator code to touch.
+//! Non-parsing output is a harmless no-op (parsing must merely never panic). The
+//! grammar is written so the bulk of output is valid and exercises the printer.
+
+#![no_main]
+
+use libfuzzer_sys::arbitrary::Unstructured;
+use libfuzzer_sys::fuzz_target;
+use mz_sql_parser::ast::display::AstDisplay;
+use mz_sql_parser::ast::visit_mut::{self, VisitMut};
+use mz_sql_parser::ast::{AstInfo, Expr, Op, Raw, Statement, Value};
+use mz_sql_parser::parser::parse_statements;
+use mz_sql_pretty::pretty_str_simple;
+
+// The grammar rule table compiled from `grammar/sql.g` by `build.rs`. Provides
+// `enum Item`, `struct Rule`, `static RULES: &[Rule]`, and `static START`.
+include!(concat!(env!("OUT_DIR"), "/grammar.rs"));
+
+// ---------------------------------------------------------------------------
+// Round-trip oracle (mirrors sql_roundtrip / parse_pretty_roundtrip).
+// ---------------------------------------------------------------------------
+
+/// Strip syntactic noise so AST equality reflects *semantic* fidelity:
+/// `Declare`/`Prepare` capture raw text, `Expr::Nested` records parens the
+/// printer is free to add or drop, and a negative numeric literal is the same
+/// value whether the parser folded the sign in (`Number("-1")`) or left a unary
+/// op (`- 1`). The parser chooses by *context* (a leading `- 1` folds, `a + - 1`
+/// does not), so the two forms must compare equal.
+///
+/// Mutates `stmt` in place. For `Declare`/`Prepare` the captured SQL text is
+/// cleared and the nested statement is normalized recursively.
+fn normalize(stmt: &mut Statement<Raw>) {
+    match stmt {
+        Statement::Declare(d) => {
+            d.sql.clear();
+            normalize(&mut d.stmt);
+        }
+        Statement::Prepare(p) => {
+            p.sql.clear();
+            normalize(&mut p.stmt);
+        }
+        _ => {}
+    }
+    RemoveParens.visit_statement_mut(stmt);
+}
+
+/// AST visitor behind `normalize`: unwraps `Expr::Nested` and rewrites folded
+/// negative numeric literals into a unary minus.
+struct RemoveParens;
+
+impl<'a, T: AstInfo> VisitMut<'a, T> for RemoveParens {
+    fn visit_expr_mut(&mut self, expr: &'a mut Expr<T>) {
+        // Children first, so nested parens are already gone when this node is
+        // rewritten.
+        visit_mut::visit_expr_mut(self, expr);
+        if let Expr::Nested(inner) = expr {
+            *expr = (**inner).clone();
+        }
+        // Canonicalize a negative numeric literal to a unary minus over the bare
+        // number, so it compares equal to the unfolded `- <number>` form the
+        // parser produces in non-leading position. (Positive literals are never
+        // sign-prefixed by the parser, so only `-` needs handling.)
+        if let Expr::Value(Value::Number(n)) = expr {
+            if let Some(rest) = n.strip_prefix('-') {
+                *expr = Expr::Op {
+                    op: Op::bare("-"),
+                    expr1: Box::new(Expr::Value(Value::Number(rest.to_string()))),
+                    expr2: None,
+                };
+            }
+        }
+    }
+}
+
+/// Reparse errors that are a known printer/parser asymmetry rather than a bug.
+fn benign_reparse_error(msg: &str) -> bool { + msg.contains("exceeds nested expression limit") + || msg.contains("Expected left square bracket") + || msg.contains("Expected left parenthesis") + || msg.contains("Expected IN, found") + || msg.contains("Expected arrow, found") +} + +fn check_pretty(sql: &str, orig_ast: &Statement) { + for width in [100, 1] { + let pretty = match pretty_str_simple(sql, width) { + Ok(p) => p, + Err(e) => panic!("parsed but pretty failed: input={sql:?} width={width} err={e}"), + }; + let reparsed = match parse_statements(&pretty) { + Ok(r) => r, + Err(e) => { + if benign_reparse_error(&e.to_string()) { + continue; + } + panic!("pretty output failed to reparse: pretty={pretty:?} width={width} err={e}"); + } + }; + let Some(stmt) = reparsed.into_iter().next() else { + continue; + }; + let mut reparsed_ast = stmt.ast; + normalize(&mut reparsed_ast); + assert_eq!( + *orig_ast, reparsed_ast, + "AST changed through pretty roundtrip\ninput: {sql:?}\nwidth: {width}\npretty: {pretty:?}" + ); + } +} + +fn check_display(orig_ast: &Statement) { + let displayed = orig_ast.to_ast_string_simple(); + let reparsed = match parse_statements(&displayed) { + Ok(r) => r, + Err(e) => { + if benign_reparse_error(&e.to_string()) { + return; + } + panic!("AstDisplay output failed to reparse: displayed={displayed:?} err={e}"); + } + }; + if reparsed.len() != 1 { + return; + } + let mut reparsed_ast = reparsed.into_iter().next().unwrap().ast; + // Normalize the reparse too (mirroring `check_pretty`): the parser may + // re-insert a semantically-redundant `Expr::Nested` (e.g. it parenthesizes a + // cast under unary minus), and per the oracle's contract those parens are + // free to add or drop. Stripping them from both sides leaves a genuine + // *structural* drift to still trip the assert. + normalize(&mut reparsed_ast); + // Compare ASTs *structurally*, not by re-printed string. 
A printer that drops + // a needed paren can map two distinct ASTs onto the same string (e.g. + // `IsExpr(a, DistinctFrom(Or(b, c)))` and `Or(IsExpr(a, DistinctFrom(b)), c)` + // both print `a IS DISTINCT FROM b OR c`). A stable-string comparison is blind + // to those collisions, but the structural comparison catches them. The stable + // strings are still shown for a readable diff. + assert_eq!( + *orig_ast, + reparsed_ast, + "AstDisplay roundtrip drifted\ndisplayed: {displayed:?}\norig: {}\nreparsed: {}", + orig_ast.to_ast_string_stable(), + reparsed_ast.to_ast_string_stable(), + ); +} + +// --------------------------------------------------------------------------- +// Grammar walker. +// --------------------------------------------------------------------------- + +/// Identifiers for `@ident`, weighted toward the cases that exercise the +/// printer's quoting decision (bare names, quoted keyword collisions, names that +/// only round-trip when quoted). +const IDENTS: &[&str] = &[ + "a", "b", "c", "x", "y", "col", "foo", "bar", "t1", "t2", "\"select\"", "\"from\"", "\"any\"", + "\"Mixed\"", "\"with space\"", "\"a.b\"", "\"1col\"", "\"qu\"\"ote\"", +]; + +/// String literals for `@str`, weighted toward lexing/escaping edge cases. +const STRINGS: &[&str] = &[ + "'a'", "''", "'foo bar'", "'it''s'", "'a\"b'", "'%'", "'_'", "'100'", +]; + +/// Every keyword the lexer knows, for `@kw` (a bare keyword used as an +/// identifier, the printer-quoting property nearly every round-trip bug we've +/// found violated). Read from the lexer's list so it stays complete. +fn keywords() -> &'static [&'static str] { + use std::sync::OnceLock; + static KW: OnceLock> = OnceLock::new(); + KW.get_or_init(|| { + include_str!("../../../sql-lexer/src/keywords.txt") + .lines() + .map(str::trim) + .filter(|l| !l.is_empty() && !l.starts_with('#')) + .collect() + }) +} + +/// Recursion-depth budget for one generated statement. 
+const MAX_DEPTH: u32 = 6; +/// Absolute recursion guard, independent of `MAX_DEPTH`, so a cyclic "leaf" +/// alternative in the grammar can never spin forever. +const MAX_CALLS: u32 = 400; +/// Hard cap on generated length: structure-aware generation can otherwise blow +/// up through nested subqueries. +const MAX_OUTPUT: usize = 2000; + +struct Gen<'a, 'u> { + u: &'u mut Unstructured<'a>, + out: String, +} + +impl<'a, 'u> Gen<'a, 'u> { + /// A uniform choice in `0..n` (0 when the byte stream is exhausted, so the + /// generator deterministically winds down at the end of input). + fn pick(&mut self, n: usize) -> usize { + if n <= 1 { + return 0; + } + self.u + .int_in_range(0..=(n as u64 - 1)) + .map(|v| v as usize) + .unwrap_or(0) + } + + fn one_of(&mut self, opts: &[&str]) { + let i = self.pick(opts.len()); + self.out.push_str(opts[i]); + } + + /// Expand rule `idx`. `depth` is the grammar recursion budget, `calls` is the + /// absolute call guard. Once either is spent (or the output cap is hit), the + /// rule's precomputed `leaf_alt` is used and rule references stop recursing. 
+ fn gen(&mut self, idx: usize, depth: u32, calls: u32) { + if self.out.len() >= MAX_OUTPUT || calls >= MAX_CALLS { + return; + } + let rule = &RULES[idx]; + let exhausted = depth == 0; + let alt = if exhausted { + rule.alts[rule.leaf_alt] + } else { + rule.alts[self.pick(rule.alts.len())] + }; + for item in alt { + match item { + Item::Lit(s) => self.out.push_str(s), + Item::Rule(i) => self.gen(*i, depth.saturating_sub(1), calls + 1), + Item::Ident => self.one_of(IDENTS), + Item::Kw => { + let kw = keywords(); + let i = self.pick(kw.len()); + self.out.push_str(kw[i]); + } + Item::Int => { + let n = self.pick(1000); + self.out.push_str(&n.to_string()); + } + Item::Str => self.one_of(STRINGS), + } + } + } +} + +fn run(data: &[u8]) { + let mut u = Unstructured::new(data); + let mut g = Gen { + u: &mut u, + out: String::new(), + }; + g.gen(START, MAX_DEPTH, 0); + let sql = g.out; + + // Parsing must never panic. Only a single parseable statement is checked + // against the round-trip oracle (matching `sql_roundtrip`). + let Ok(parsed) = parse_statements(&sql) else { + return; + }; + if parsed.len() != 1 { + return; + } + let mut ast = parsed.into_iter().next().unwrap().ast; + normalize(&mut ast); + check_pretty(&sql, &ast); + check_display(&ast); +} + +fuzz_target!(|data: &[u8]| { + run(data); +}); diff --git a/src/sql-parser/fuzz/fuzz_targets/sql_roundtrip.rs b/src/sql-parser/fuzz/fuzz_targets/sql_roundtrip.rs new file mode 100644 index 0000000000000..f8b00c4c9ee70 --- /dev/null +++ b/src/sql-parser/fuzz/fuzz_targets/sql_roundtrip.rs @@ -0,0 +1,2090 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: the SQL round-trip oracle. 
`parse -> AstDisplay/pretty -> +//! reparse` must preserve the AST. It consumes the libFuzzer byte stream as a +//! sequence of *grammar choices* (not raw text, which mostly fails to parse so +//! the round-trip is a no-op) and, on most runs, emits a syntactically plausible +//! SQL query that reaches deep into the query/expression/DDL grammar. Byte +//! mutation rarely assembles a valid instance of those print paths. Coverage +//! feedback steers the byte→choice mapping. +//! +//! A minority of runs (`Gen::soup`) instead emit *full-vocabulary soup*: a +//! random sequence drawn from every keyword, identifier, literal form, operator/ +//! symbol, and raw characters. Almost none parses, but the rare soup that does +//! reaches query shapes the hand-written productions never assemble (this is how +//! the `COPY into …` round-trip bug surfaced: a bare `into` relation re-lexed as +//! `COPY`'s optional `INTO` keyword), and the rest exercises the parser/lexer +//! error paths (which must never panic/hang). Same byte→choice model, same +//! oracle, so soup and structured generation share one target. +//! +//! The generator is deliberately biased toward the constructs that stress the +//! printer's quoting and parenthesization logic, which is where every round-trip +//! bug we've found lives: identifiers/function names that collide with keywords +//! (forcing quoting), varied operator-precedence groupings (forcing +//! re-parenthesization), casts under unary operators, quantified comparisons +//! against both a subquery (`op ANY/ALL/SOME (SELECT …)`) and an array value +//! (`op ANY/ALL/SOME (ARRAY[…])`, the distinct `Expr::AnyExpr`/`AllExpr` +//! nodes), the special-grammar forms (`EXTRACT`, `POSITION`, `SUBSTRING`, +//! `TRIM`, `CAST`, `MAP`, `LIST[…]`/`LIST(SELECT …)`, `ARRAY`, `ROW`), the +//! dedicated-node forms `GREATEST`/`LEAST` (`Expr::HomogenizingFunction`), +//! `NULLIF` (`Expr::NullIf`), `NORMALIZE` (a re-quoted `normalize(...)` call), +//! 
`LIKE`/`ILIKE … ESCAPE <char>` (the optional `escape` of `Expr::Like`), the
+//! `$N` placeholder (`Expr::Parameter`), window frames, CTEs, and set
+//! operations.
+//!
+//! It also assembles the **connector DDL family** the query-centric arms can't
+//! reach (`connector_ddl`): `CREATE TABLE … FROM SOURCE` (the subsource
+//! REFERENCE / column-or-constraint spec / `WITH (TEXT COLUMNS|EXCLUDE
+//! COLUMNS|DETAILS …)`), `CREATE SOURCE` (load-generator and connection-backed
+//! Kafka with FORMAT/ENVELOPE), `CREATE SINK … INTO KAFKA`, `CREATE CONNECTION`,
+//! `CREATE … FROM WEBHOOK`, and the surrounding `SHOW CREATE` / `EXPLAIN … FOR`
+//! / `ALTER SOURCE … ADD SUBSOURCE` / `VALIDATE CONNECTION` forms. These are the
+//! option-heavy printers (formats, envelopes, source/sink options) the
+//! structured generator otherwise never exercises. Only round-trip-proven
+//! shapes are emitted. The generator varies the names and embedded queries.
+//! (Note this is the `CREATE TABLE … FROM SOURCE` family from database-issues
+//! #10034: that bug round-trips through parse/print cleanly, its defect is in
+//! re-execution, so it is out of reach for *any* parser-level oracle. The
+//! statement's printer is still covered for the bugs this target can catch.)
+//!
+//! Each generated query that parses is checked against the same two oracles as
+//! `parse_pretty_roundtrip` (full-AST equality through `pretty_str_simple`,
+//! verified at two line widths so wrapping can't perturb the AST) and
+//! `parse_display_roundtrip` (stable-string equality through `AstDisplay`).
+
+#![no_main]
+
+use std::collections::BTreeMap;
+use std::sync::{Mutex, OnceLock};
+
+use libfuzzer_sys::arbitrary::{Arbitrary, Unstructured};
+use libfuzzer_sys::fuzz_target;
+use mz_sql_parser::ast::display::AstDisplay;
+use mz_sql_parser::ast::visit_mut::{self, VisitMut};
+use mz_sql_parser::ast::{AstInfo, Expr, Raw, Statement};
+use mz_sql_parser::parser::parse_statements;
+use mz_sql_pretty::pretty_str_simple;
+
+// ---------------------------------------------------------------------------
+// Round-trip oracle (mirrors parse_pretty_roundtrip / parse_display_roundtrip).
+// ---------------------------------------------------------------------------
+
+/// Strip syntactic noise so AST equality reflects *semantic* fidelity:
+/// `Declare`/`Prepare` capture raw text, and `Expr::Nested` records parens that
+/// the printer is free to add or drop. See `parse_pretty_roundtrip` for detail.
+///
+/// Mutates `stmt` in place. For `Declare`/`Prepare` the captured SQL text is
+/// cleared and the nested statement is normalized recursively.
+fn normalize(stmt: &mut Statement<Raw>) {
+    match stmt {
+        Statement::Declare(d) => {
+            d.sql.clear();
+            normalize(&mut d.stmt);
+        }
+        Statement::Prepare(p) => {
+            p.sql.clear();
+            normalize(&mut p.stmt);
+        }
+        _ => {}
+    }
+    RemoveParens.visit_statement_mut(stmt);
+}
+
+/// AST visitor behind `normalize`: replaces every `Expr::Nested` with the
+/// expression it wraps.
+struct RemoveParens;
+
+impl<'a, T: AstInfo> VisitMut<'a, T> for RemoveParens {
+    fn visit_expr_mut(&mut self, expr: &'a mut Expr<T>) {
+        // Children first, so inner parens are already gone when this node is
+        // unwrapped.
+        visit_mut::visit_expr_mut(self, expr);
+        if let Expr::Nested(inner) = expr {
+            *expr = (**inner).clone();
+        }
+    }
+}
+
+/// Reparse errors that are a known printer/parser asymmetry rather than a bug.
+fn benign_reparse_error(msg: &str) -> bool { + msg.contains("exceeds nested expression limit") + || msg.contains("Expected left square bracket") + || msg.contains("Expected left parenthesis") + || msg.contains("Expected IN, found") + || msg.contains("Expected arrow, found") +} + +fn check_pretty(sql: &str, orig_ast: &Statement) { + // The line width must not affect the AST: wrapping is purely cosmetic, so + // both a wide layout (everything on one line) and a narrow one (maximally + // wrapped) must reparse to the same AST. Checking two widths catches a + // wrapping that drops/adds a token only on the path it takes at one width. + for width in [100, 1] { + let pretty = match pretty_str_simple(sql, width) { + Ok(p) => p, + Err(e) => panic!("parsed but pretty failed: input={sql:?} width={width} err={e}"), + }; + let reparsed = match parse_statements(&pretty) { + Ok(r) => r, + Err(e) => { + if benign_reparse_error(&e.to_string()) { + continue; + } + panic!("pretty output failed to reparse: pretty={pretty:?} width={width} err={e}"); + } + }; + let Some(stmt) = reparsed.into_iter().next() else { + continue; + }; + let mut reparsed_ast = stmt.ast; + normalize(&mut reparsed_ast); + assert_eq!( + *orig_ast, reparsed_ast, + "AST changed through pretty roundtrip\ninput: {sql:?}\nwidth: {width}\npretty: {pretty:?}" + ); + } +} + +fn check_display(orig_ast: &Statement) { + let displayed = orig_ast.to_ast_string_simple(); + let reparsed = match parse_statements(&displayed) { + Ok(r) => r, + Err(e) => { + if benign_reparse_error(&e.to_string()) { + return; + } + panic!("AstDisplay output failed to reparse: displayed={displayed:?} err={e}"); + } + }; + if reparsed.len() != 1 { + return; + } + let reparsed_ast = reparsed.into_iter().next().unwrap().ast; + assert_eq!( + orig_ast.to_ast_string_stable(), + reparsed_ast.to_ast_string_stable(), + "AstDisplay roundtrip drifted\ndisplayed: {displayed:?}" + ); +} + +// 
---------------------------------------------------------------------------
+// Grammar generator.
+// ---------------------------------------------------------------------------
+
+/// Identifiers, weighted toward the cases that exercise the printer's quoting
+/// decision: bare names, quoted keyword collisions (parse as identifiers but
+/// the printer must keep them quoted or they re-lex as keywords), and names
+/// that only round-trip when quoted (case, spaces, dots, embedded quotes,
+/// leading digit).
+const IDENTS: &[&str] = &[
+    "a",
+    "b",
+    "c",
+    "x",
+    "y",
+    "z",
+    "t",
+    "u",
+    "col",
+    "foo",
+    "bar",
+    "t1",
+    "t2",
+    "\"some\"",
+    "\"any\"",
+    "\"all\"",
+    "\"select\"",
+    "\"from\"",
+    "\"where\"",
+    "\"map\"",
+    "\"list\"",
+    "\"array\"",
+    "\"position\"",
+    "\"trim\"",
+    "\"as\"",
+    "\"order\"",
+    "\"group\"",
+    "\"row\"",
+    "\"exists\"",
+    "\"coalesce\"",
+    "\"in\"",
+    "\"Mixed\"",
+    "\"with space\"",
+    "\"a.b\"",
+    "\"1col\"",
+    "\"qu\"\"ote\"",
+    "\"\"",
+];
+
+/// Every keyword the lexer knows, read at compile time from the lexer's keyword
+/// list (so it stays complete as keywords are added). `ident()` emits these
+/// *bare*: whether the printer keeps a keyword-as-identifier unambiguous
+/// (quoting it, or emitting a disambiguating keyword) is the round-trip property
+/// nearly every printer bug we've found violated, such as `AS`, `LIST`,
+/// `ANY`/`ALL`/`SOME`, and the `DEALLOCATE … PREPARE` finding. The pre-baked
+/// `IDENTS` only has *quoted* collisions, which are already safe by
+/// construction, so they never exercise that decision.
+///
+/// The list is parsed once on first use and cached. Blank lines and lines
+/// starting with `#` in the keyword file are skipped, and each remaining line
+/// is trimmed.
+fn keywords() -> &'static [&'static str] {
+    static KW: OnceLock<Vec<&'static str>> = OnceLock::new();
+    KW.get_or_init(|| {
+        include_str!("../../../sql-lexer/src/keywords.txt")
+            .lines()
+            .map(str::trim)
+            .filter(|l| !l.is_empty() && !l.starts_with('#'))
+            .collect()
+    })
+}
+
+/// Bare scalar type names (no recursive array/list/map wrappers).
+const SCALAR_TYPES: &[&str] = &[
+    "int2",
+    "int4",
+    "int8",
+    "integer",
+    "smallint",
+    "bigint",
+    "real",
+    "double precision",
+    "float",
+    "numeric",
+    "numeric(10, 2)",
+    "decimal(5)",
+    "boolean",
+    "bool",
+    "text",
+    "varchar(10)",
+    "char(5)",
+    "bytea",
+    "date",
+    "time",
+    "timestamp",
+    "timestamp(3)",
+    "timestamptz",
+    "interval",
+    "jsonb",
+    "uuid",
+    "oid",
+];
+
+/// Scalar/value literals, including ones with tricky lexing (escaped quote,
+/// leading/trailing dot, exponent) and typed literals.
+const VALUES: &[&str] = &[
+    // Numbers, including the leading-dot, trailing-dot and exponent forms.
+    "0",
+    "1",
+    "2",
+    "42",
+    "3.14",
+    "0.5",
+    ".5",
+    "1.",
+    "1e10",
+    "1.5e-3",
+    // Single-quoted strings, including the empty string and a doubled quote.
+    "'a'",
+    "''",
+    "'foo bar'",
+    "'it''s'",
+    "'%'",
+    "'_'",
+    "'100'",
+    // Keyword literals.
+    "true",
+    "false",
+    "null",
+    // Typed literals and prefixed string forms.
+    "INTERVAL '1' DAY",
+    "INTERVAL '1-2' YEAR TO MONTH",
+    "INTERVAL '1 2:03:04' DAY TO SECOND",
+    "DATE '2020-01-01'",
+    "TIMESTAMP '2020-01-01 00:00:00'",
+    "b'010'",
+    "x'deadbeef'",
+];
+
+/// Binary operators spanning precedence levels and the custom jsonb/range ops.
+const BIN_OPS: &[&str] = &[
+    "+", "-", "*", "/", "%", "||", "=", "<>", "!=", "<", ">", "<=", ">=", "AND", "OR", "->", "->>",
+    "#>", "#>>", "@>", "<@", "#", "&", "|", "<<", ">>", "~", "~*", "!~", "!~*",
+];
+
+/// "Noise" tokens spliced into output at random points (see `Gen::maybe_noise`).
+/// Mostly punctuation/operators/odd literals plus comment and dollar-quote
+/// starts: emitting them, alongside bare keywords and arbitrary characters,
+/// produces mostly-invalid SQL that exercises the parser/lexer's error paths
+/// (which must never panic on any input), and occasionally a valid-but-unusual
+/// statement the structured grammar wouldn't assemble.
+const NOISE: &[&str] = &[
+    "(", ")", "[", "]", "{", "}", ",", ";", ".", "::", ":", "*", "@", "?", "!", "\\", "\"", "'",
+    "->", "->>", "#>>", "||", "<>", "=>", "%", "~", "&", "|", "$1", "$$", "''", "\"\"", "/*", "*/",
+    "--", " ", "\t", "\n", "1e999", "0x1", "-0", ".", "e", "E'\\x41'", "U&'\\0041'",
+];
+
+// The parser's AST source, embedded so the fuzzed connector option space stays
+// complete as options are ADDED, the same self-syncing trick `keywords()` uses
+// for the lexer's keyword list. A hardcoded option-name snapshot would silently
+// rot: a new `KafkaSinkConfigOptionName` variant would go un-fuzzed until
+// someone remembered to update the fuzzer. `option_names` instead reads each
+// enum's display phrases straight from here at first use.
+const DDL_SRC: &str = include_str!("../../src/ast/defs/ddl.rs");
+const STMT_SRC: &str = include_str!("../../src/ast/defs/statement.rs");
+
+/// The display phrases of every variant of the option-name enum `enum_name`,
+/// scraped from `impl AstDisplay for <enum_name>` in the embedded AST source
+/// (every string literal in that impl body is an option phrase). Returns
+/// `'static` slices into the source; the phrases aren't allocated. New options
+/// are picked up automatically. An option whose value grammar isn't the generic
+/// one just falls through `config_option`'s generic arm (a harmless no-op parse
+/// for the special-grammar ones, a valid clause for the rest) until a dedicated
+/// case is added here.
+fn option_names(enum_name: &'static str) -> &'static [&'static str] { + static CACHE: OnceLock>> = + OnceLock::new(); + let cache = CACHE.get_or_init(|| Mutex::new(BTreeMap::new())); + if let Some(v) = cache.lock().unwrap().get(enum_name) { + return v; + } + let v: &'static [&'static str] = + Box::leak(extract_option_phrases(enum_name).into_boxed_slice()); + cache.lock().unwrap().insert(enum_name, v); + v +} + +fn extract_option_phrases(enum_name: &str) -> Vec<&'static str> { + let needle = format!("impl AstDisplay for {enum_name}"); + for src in [DDL_SRC, STMT_SRC] { + let Some(start) = src.find(&needle) else { + continue; + }; + // Reject a match that's only a prefix of a longer enum name. + if !src[start + needle.len()..].starts_with([' ', '\n']) { + continue; + } + let Some(rel) = src[start..].find('{') else { + continue; + }; + let brace = start + rel; + // Brace-match to the end of the `impl … { … }` body. + let mut depth = 0i32; + let mut end = brace; + for (k, c) in src[brace..].char_indices() { + match c { + '{' => depth += 1, + '}' => { + depth -= 1; + if depth == 0 { + end = brace + k; + break; + } + } + _ => {} + } + } + let mut out = Vec::new(); + let mut rest = &src[brace..=end]; + while let Some(q) = rest.find('"') { + let tail = &rest[q + 1..]; + match tail.find('"') { + Some(e) => { + out.push(&tail[..e]); + rest = &tail[e + 1..]; + } + None => break, + } + } + return out; + } + Vec::new() +} + +/// Recursion depth budget for a generated query. +const MAX_DEPTH: u32 = 5; +/// Hard cap on generated query length. Structure-aware generation can otherwise +/// blow up: depth recursion compounds through subqueries/derived tables, and a +/// 2GB string OOMs the fuzzer. Once we cross this, `expr`/`from_item` collapse +/// to leaves so output overshoots by at most one in-flight expansion. 
+const MAX_OUTPUT: usize = 2000;
+
+/// Generator state: the fuzzer-supplied byte stream `u` that drives every
+/// choice, and the SQL text `out` assembled so far.
+struct Gen<'a, 'u> {
+    u: &'u mut Unstructured<'a>,
+    out: String,
+}
+
+impl<'a, 'u> Gen<'a, 'u> {
+    /// `true` while the output is still shorter than `MAX_OUTPUT` bytes.
+    fn budget(&self) -> bool {
+        self.out.len() < MAX_OUTPUT
+    }
+
+    /// A uniform choice in `0..n` (0 when the byte stream is exhausted, so the
+    /// generator deterministically winds down to leaves at the end of input).
+    /// `n <= 1` returns 0 without consuming any input.
+    fn pick(&mut self, n: usize) -> usize {
+        if n <= 1 {
+            return 0;
+        }
+        self.u
+            .int_in_range(0..=(n as u64 - 1))
+            .map(|v| v as usize)
+            .unwrap_or(0)
+    }
+
+    /// `true` with probability `num/den` (false on exhaustion).
+    fn chance(&mut self, num: u32, den: u32) -> bool {
+        self.u.ratio(num, den).unwrap_or(false)
+    }
+
+    /// Append one entry of `opts`, chosen with `pick`. `opts` must be
+    /// non-empty: `pick(0)` yields index 0, which would be out of bounds.
+    fn one_of(&mut self, opts: &[&str]) {
+        let i = self.pick(opts.len());
+        self.out.push_str(opts[i]);
+    }
+
+    /// Append an identifier: a bare keyword one time in five, otherwise an
+    /// entry of `IDENTS`.
+    fn ident(&mut self) {
+        // 1 in 5: a *bare* keyword used as an identifier. The parser accepts the
+        // non-reserved ones, and whether the printer keeps them unambiguous is
+        // exactly the property under test (see `keywords`). Reserved keywords
+        // just make the statement fail to parse, a harmless no-op.
+        if self.chance(1, 5) {
+            self.one_of(keywords());
+        } else {
+            self.one_of(IDENTS);
+        }
+    }
+
+    /// Splice a bit of "noise" into the output: a noise token, a bare keyword,
+    /// or a short run of arbitrary characters. See `NOISE`.
+    fn inject_noise(&mut self) {
+        match self.pick(3) {
+            0 => self.one_of(NOISE),
+            1 => self.one_of(keywords()),
+            _ => {
+                // A short run of arbitrary characters. `data: &str` only ever
+                // delivers valid UTF-8, so `char` (a scalar value) is the right
+                // granularity for "random bytes" the parser could actually see.
+                let n = 1 + self.pick(4);
+                for _ in 0..n {
+                    // A failed draw is skipped rather than ending the run.
+                    if let Ok(c) = char::arbitrary(self.u) {
+                        self.out.push(c);
+                    }
+                }
+            }
+        }
+    }
+
+    /// With low probability, splice noise into the output.
Called at generation + /// boundaries so most statements stay valid (and exercise the round trip) + /// while a steady minority are perturbed (and exercise parser robustness). + fn maybe_noise(&mut self) { + if self.budget() && self.chance(1, 12) { + self.inject_noise(); + } + } + + /// Full-vocabulary "soup": a random sequence drawn from every keyword, the + /// identifier set, every literal form, the operators/symbols, and raw + /// characters (via `inject_noise`). Almost none parses, but the rare soup + /// that does reaches query shapes the structured productions never assemble + /// (e.g. `COPY into …`, where a bare `into` relation re-lexed as the optional + /// `INTO` keyword), and the rest stresses the parser/lexer error paths. The + /// fraction of soup runs is tuned low (see `generate`) so the structured + /// path still does the bulk of the round-trip work. + fn soup(&mut self) { + let n = 1 + self.pick(40); + for _ in 0..n { + if !self.budget() { + break; + } + match self.pick(8) { + // Keywords are the bulk of the "words" and the richest + // parser-dispatch surface, so weight them up. + 0..=2 => { + let kw = keywords(); + let i = self.pick(kw.len()); + self.out.push_str(kw[i]); + } + 3 => self.ident(), + 4 => self.value(), + // Structural punctuation and operators. + 5 => self.one_of(&["(", ")", ",", ".", "::", ":", ";", "[", "]"]), + 6 => self.one_of(BIN_OPS), + // Noise tokens, bare keywords, or raw characters. + _ => self.inject_noise(), + } + // Usually a space, sometimes nothing, to test adjacent-token lexing. + if self.pick(4) != 0 { + self.out.push(' '); + } + } + } + + /// A possibly-qualified name: `a`, `a.b`, or `a.b.c`. + fn qualified_name(&mut self) { + let parts = 1 + self.pick(3); + for i in 0..parts { + if i > 0 { + self.out.push('.'); + } + self.ident(); + } + } + + fn data_type(&mut self, depth: u32) { + if depth > 0 && self.chance(1, 3) { + // Recursive array / list / map wrappers. 
+ match self.pick(3) { + 0 => { + self.data_type(depth - 1); + self.out.push_str("[]"); + } + 1 => { + self.data_type(depth - 1); + self.out.push_str(" list"); + } + _ => { + self.out.push_str("map[text => "); + self.data_type(depth - 1); + self.out.push(']'); + } + } + } else { + self.one_of(SCALAR_TYPES); + } + } + + /// An expression, optionally wrapped in parentheses to force a non-default + /// grouping (the parens become `Nested`, which the oracle strips, so what's + /// actually under test is whether the printer reproduces the grouping). + fn expr_grouped(&mut self, depth: u32) { + if depth > 0 && self.chance(2, 5) { + self.out.push('('); + self.expr(depth); + self.out.push(')'); + } else { + self.expr(depth); + } + } + + fn expr(&mut self, depth: u32) { + if depth == 0 || !self.budget() { + self.leaf_expr(); + return; + } + let d = depth - 1; + match self.pick(19) { + // Binary operator chain. + 0 | 1 => { + self.expr_grouped(d); + self.out.push(' '); + self.one_of(BIN_OPS); + self.out.push(' '); + self.expr_grouped(d); + } + // Unary operators (incl. the negated-cast precedence trap). + 2 => { + self.one_of(&["-", "+", "NOT ", "~"]); + self.expr_grouped(d); + } + // `::` cast. + 3 => { + self.expr_grouped(d); + self.out.push_str("::"); + self.data_type(2); + } + // CAST(e AS t). + 4 => { + self.out.push_str("CAST("); + self.expr(d); + self.out.push_str(" AS "); + self.data_type(2); + self.out.push(')'); + } + // IS [NOT] {NULL|TRUE|FALSE|UNKNOWN} / IS [NOT] DISTINCT FROM. + 5 => { + self.expr_grouped(d); + self.out.push_str(" IS "); + if self.chance(1, 2) { + self.out.push_str("NOT "); + } + if self.chance(1, 2) { + self.one_of(&["NULL", "TRUE", "FALSE", "UNKNOWN"]); + } else { + self.out.push_str("DISTINCT FROM "); + self.expr_grouped(d); + } + } + // BETWEEN. 
+ 6 => { + self.expr_grouped(d); + if self.chance(1, 3) { + self.out.push_str(" NOT"); + } + self.out.push_str(" BETWEEN "); + self.expr_grouped(d); + self.out.push_str(" AND "); + self.expr_grouped(d); + } + // [NOT] LIKE / ILIKE [ESCAPE ] / SIMILAR TO. + 7 => { + self.expr_grouped(d); + if self.chance(1, 3) { + self.out.push_str(" NOT"); + } + // SIMILAR TO has no ESCAPE in this AST. LIKE/ILIKE + // (`Expr::Like`) carries an optional `escape` the printer emits. + let kind = self.pick(3); + self.out + .push_str([" LIKE ", " ILIKE ", " SIMILAR TO "][kind]); + self.expr_grouped(d); + if kind != 2 && self.chance(1, 2) { + self.out.push_str(" ESCAPE "); + self.expr_grouped(d); + } + } + // Function call (plain / aggregate / window). + 8 | 9 => self.func_call(d), + // CASE. + 10 => self.case_expr(d), + // Quantified comparison: e op {ANY|ALL|SOME} (...). A subquery in + // the parens yields `Expr::AnySubquery`/`AllSubquery`. A single + // non-subquery expression yields the array-valued + // `Expr::AnyExpr`/`AllExpr` (a distinct printer branch). Both LHS + // forms go through `write_quantified_left`'s paren logic. + 11 => { + self.expr_grouped(d); + self.out.push(' '); + self.one_of(&["=", "<>", "<", ">", "<=", ">="]); + self.out.push(' '); + self.one_of(&["ANY", "ALL", "SOME"]); + self.out.push_str(" ("); + if self.chance(1, 2) { + self.query(d); + } else if self.chance(1, 2) { + // Array-shaped operand keeps the comparison well-typed. + self.out.push_str("ARRAY["); + self.expr_list(d, 1, 3); + self.out.push(']'); + } else { + self.expr(d); + } + self.out.push(')'); + } + // [NOT] IN (list) or IN (subquery), and EXISTS / scalar subquery. 
+ 12 => { + if self.chance(1, 2) { + self.expr_grouped(d); + if self.chance(1, 3) { + self.out.push_str(" NOT"); + } + self.out.push_str(" IN ("); + if self.chance(1, 2) { + self.query(d); + } else { + self.expr_list(d, 1, 3); + } + self.out.push(')'); + } else if self.chance(1, 2) { + self.out.push_str("EXISTS ("); + self.query(d); + self.out.push(')'); + } else { + self.out.push('('); + self.query(d); + self.out.push(')'); + } + } + // Collection literals: ARRAY[...]/ARRAY(subquery), LIST[...], MAP[k=>v], ROW(...). + 13 => self.collection_expr(d), + // Special grammar forms. + 14 => self.special_form(d), + // `COLLATE`, postfix form (` COLLATE `), binds tightly. + 15 => { + self.expr_grouped(d); + self.out.push_str(" COLLATE "); + self.ident(); + } + // `AT TIME ZONE`, postfix form, desugars to the `timezone(...)` function. + 16 => { + self.expr_grouped(d); + self.out.push_str(" AT TIME ZONE "); + self.expr_grouped(d); + } + // Namespaced operator `OPERATOR(schema.op)`, distinct display path. + 17 => { + self.expr_grouped(d); + self.out.push_str(" OPERATOR(pg_catalog."); + self.one_of(&["+", "-", "*", "=", "<", ">", "@>", "->"]); + self.out.push_str(") "); + self.expr_grouped(d); + } + // Subscript / field access / tuple. + _ => match self.pick(4) { + 0 => { + self.expr_grouped(d); + self.out.push('['); + self.value(); + if self.chance(1, 2) { + self.out.push(':'); + self.value(); + } + self.out.push(']'); + } + 1 => { + // Subscript with full-expression bounds (not just literals). 
+ self.expr_grouped(d); + self.out.push('['); + self.expr(d); + if self.chance(1, 2) { + self.out.push(':'); + self.expr(d); + } + self.out.push(']'); + } + 2 => { + self.out.push('('); + self.expr(d); + self.out.push_str(")."); + self.ident(); + } + _ => { + self.out.push('('); + self.expr_list(d, 2, 3); + self.out.push(')'); + } + }, + } + } + + fn leaf_expr(&mut self) { + match self.pick(4) { + 0 => self.qualified_name(), + 1 => self.value(), + // `$N` placeholder, `Expr::Parameter`, with its own `${n}` printer. + 2 => { + let n = 1 + self.pick(9); + self.out.push('$'); + self.out.push_str(&n.to_string()); + } + _ => { + self.out.push('*'); + // `*` alone is only valid as a projection, fall back to a column + // so a leaf is always a valid scalar. + self.out.pop(); + self.qualified_name(); + } + } + } + + fn value(&mut self) { + self.one_of(VALUES); + } + + /// A string literal, weighted toward the lexing/escaping edge cases (empty, + /// embedded escaped quote, embedded double quote, percent) so connector + /// option values that must be re-escaped on display are exercised, the + /// class of the CSR `MESSAGE 'a''b'` round-trip finding. + fn string_value(&mut self) { + self.one_of(&[ + "'t'", + "''", + "'foo bar'", + "'it''s'", + "'a\"b'", + "'%'", + "'1s'", + "'localhost:9092'", + ]); + } + + fn expr_list(&mut self, depth: u32, min: usize, max: usize) { + let n = min + self.pick(max - min + 1); + for i in 0..n { + if i > 0 { + self.out.push_str(", "); + } + self.expr(depth); + } + } + + fn func_call(&mut self, depth: u32) { + // Names that collide with special grammar / quantifier keywords stress + // the printer's disambiguating quoting. 
+ self.one_of(&[ + "count", + "sum", + "max", + "min", + "abs", + "coalesce", + "\"some\"", + "\"any\"", + "\"coalesce\"", + "\"position\"", + "\"trim\"", + "\"array\"", + "\"row\"", + "generate_series", + "lower", + "f", + ]); + self.out.push('('); + if self.chance(1, 5) { + self.out.push('*'); + } else { + if self.chance(1, 5) { + self.out.push_str("DISTINCT "); + } + self.expr_list(depth, 0, 3); + if self.chance(1, 6) { + self.out.push_str(" ORDER BY "); + self.expr(depth); + } + } + self.out.push(')'); + if self.chance(1, 8) { + // Ordered-set aggregate: `f(args) WITHIN GROUP (ORDER BY …)`. + self.out.push_str(" WITHIN GROUP (ORDER BY "); + self.expr(depth); + if self.chance(1, 2) { + self.one_of(&[" ASC", " DESC"]); + } + self.out.push(')'); + } + if self.chance(1, 6) { + self.out.push_str(" FILTER (WHERE "); + self.expr(depth); + self.out.push(')'); + } + if self.chance(1, 4) { + if self.chance(1, 3) { + // A named-window reference (`OVER w`), resolves against a + // `WINDOW` clause. Parses fine on its own for round-trip. + self.out.push_str(" OVER "); + self.ident(); + } else { + self.window_spec(depth); + } + } + } + + fn window_spec(&mut self, depth: u32) { + self.out.push_str(" OVER "); + self.window_def(depth); + } + + fn window_def(&mut self, depth: u32) { + self.out.push('('); + if self.chance(1, 2) { + self.out.push_str("PARTITION BY "); + self.expr_list(depth, 1, 2); + self.out.push(' '); + } + if self.chance(2, 3) { + self.out.push_str("ORDER BY "); + self.expr(depth); + if self.chance(1, 2) { + self.one_of(&[" ASC", " DESC"]); + } + if self.chance(1, 2) { + self.one_of(&[" NULLS FIRST", " NULLS LAST"]); + } + // Frame. 
+ if self.chance(1, 2) { + self.out.push(' '); + self.one_of(&["ROWS", "RANGE", "GROUPS"]); + self.out.push_str(" BETWEEN "); + self.frame_bound(); + self.out.push_str(" AND "); + self.frame_bound(); + } + } + self.out.push(')'); + } + + fn frame_bound(&mut self) { + match self.pick(5) { + 0 => self.out.push_str("UNBOUNDED PRECEDING"), + 1 => self.out.push_str("UNBOUNDED FOLLOWING"), + 2 => self.out.push_str("CURRENT ROW"), + 3 => { + self.value(); + self.out.push_str(" PRECEDING"); + } + _ => { + self.value(); + self.out.push_str(" FOLLOWING"); + } + } + } + + fn case_expr(&mut self, depth: u32) { + self.out.push_str("CASE"); + // Optional operand (simple CASE). + if self.chance(1, 2) { + self.out.push(' '); + self.expr(depth); + } + let arms = 1 + self.pick(2); + for _ in 0..arms { + self.out.push_str(" WHEN "); + self.expr(depth); + self.out.push_str(" THEN "); + self.expr(depth); + } + if self.chance(1, 2) { + self.out.push_str(" ELSE "); + self.expr(depth); + } + self.out.push_str(" END"); + } + + fn collection_expr(&mut self, depth: u32) { + match self.pick(4) { + 0 => { + if self.chance(1, 2) { + self.out.push_str("ARRAY["); + self.expr_list(depth, 0, 3); + self.out.push(']'); + } else { + self.out.push_str("ARRAY("); + self.query(depth); + self.out.push(')'); + } + } + 1 => { + if self.chance(1, 2) { + self.out.push_str("LIST["); + self.expr_list(depth, 0, 3); + self.out.push(']'); + } else { + // `LIST()`, `Expr::ListSubquery`, the subquery + // sibling of the `LIST[...]` literal. 
+ self.out.push_str("LIST("); + self.query(depth); + self.out.push(')'); + } + } + 2 => { + self.out.push_str("MAP["); + let n = self.pick(3); + for i in 0..n { + if i > 0 { + self.out.push_str(", "); + } + self.value(); + self.out.push_str(" => "); + self.expr(depth); + } + self.out.push(']'); + } + _ => { + self.out.push_str("ROW("); + self.expr_list(depth, 0, 3); + self.out.push(')'); + } + } + } + + fn special_form(&mut self, depth: u32) { + match self.pick(9) { + 0 => { + self.out.push_str("EXTRACT("); + self.one_of(&["YEAR", "MONTH", "DAY", "HOUR", "EPOCH"]); + self.out.push_str(" FROM "); + self.expr(depth); + self.out.push(')'); + } + 1 => { + self.out.push_str("POSITION("); + self.expr(depth); + self.out.push_str(" IN "); + self.expr(depth); + self.out.push(')'); + } + 2 => { + self.out.push_str("SUBSTRING("); + self.expr(depth); + self.out.push_str(" FROM "); + self.value(); + if self.chance(1, 2) { + self.out.push_str(" FOR "); + self.value(); + } + self.out.push(')'); + } + 3 => { + self.out.push_str("TRIM("); + self.one_of(&["", "LEADING ", "TRAILING ", "BOTH "]); + self.expr(depth); + if self.chance(1, 2) { + self.out.push_str(" FROM "); + self.expr(depth); + } + self.out.push(')'); + } + 4 => { + self.out.push_str("COALESCE("); + self.expr_list(depth, 1, 3); + self.out.push(')'); + } + // GREATEST / LEAST, `Expr::HomogenizingFunction`, a dedicated AST + // node distinct from COALESCE with its own printer branch. + 5 => { + self.one_of(&["GREATEST(", "LEAST("]); + self.expr_list(depth, 1, 3); + self.out.push(')'); + } + // NULLIF(a, b), `Expr::NullIf`, also a dedicated node, not a call. + 6 => { + self.out.push_str("NULLIF("); + self.expr(depth); + self.out.push_str(", "); + self.expr(depth); + self.out.push(')'); + } + // NORMALIZE(e [, FORM]) desugars to a `normalize(...)` function + // call whose name the printer must re-quote to avoid re-triggering + // the NORMALIZE special grammar. 
+ _ => { + self.out.push_str("NORMALIZE("); + self.expr(depth); + if self.chance(1, 2) { + self.out.push_str(", "); + self.one_of(&["NFC", "NFD", "NFKC", "NFKD"]); + } + self.out.push(')'); + } + } + } + + // --- Statement structure ----------------------------------------------- + + /// A top-level statement. Mostly bare queries (the richest surface), but + /// also the statement forms that wrap a query, exercising those statements' + /// own `AstDisplay`/pretty paths (`CREATE [MATERIALIZED] VIEW`, `INSERT`, + /// `EXPLAIN`, `SUBSCRIBE`, `DECLARE … CURSOR`) and the row-mutation DML. + fn statement(&mut self, depth: u32) { + self.maybe_noise(); + // ~1/4: the connector DDL family. CREATE {SOURCE,SINK,CONNECTION}, + // CREATE TABLE … FROM SOURCE, and the SHOW CREATE / EXPLAIN / ALTER / + // VALIDATE forms around them. These option-heavy statements are the + // richest printer surface the query-centric arms below never reach. + if self.chance(1, 4) { + self.connector_ddl(depth); + self.maybe_noise(); + return; + } + // A third of the remainder: a non-query statement form (DDL, the + // prepared-statement protocol, cursors, session commands), so the + // statement-level printers, and bare-keyword names in their special + // positions, get exercised too, not just queries. + if self.chance(1, 3) { + self.rare_statement(depth); + self.maybe_noise(); + return; + } + match self.pick(15) { + 0..=3 => self.query(depth), + 10 => { + // CREATE TABLE exercises the column-def + data-type printers. 
+ self.out.push_str("CREATE TABLE "); + self.ident(); + self.out.push_str(" ("); + let cols = 1 + self.pick(3); + for i in 0..cols { + if i > 0 { + self.out.push_str(", "); + } + self.ident(); + self.out.push(' '); + self.data_type(2); + match self.pick(4) { + 0 => self.out.push_str(" NOT NULL"), + 1 => { + self.out.push_str(" DEFAULT "); + self.value(); + } + _ => {} + } + } + self.out.push(')'); + } + 4 => { + self.out.push_str("CREATE VIEW "); + self.ident(); + self.out.push_str(" AS "); + self.query(depth); + } + 5 => { + self.out.push_str("CREATE MATERIALIZED VIEW "); + self.ident(); + self.out.push_str(" AS "); + self.query(depth); + } + 6 => { + self.out.push_str("INSERT INTO "); + self.qualified_name(); + self.out.push(' '); + self.query(depth); + } + 7 => { + self.out.push_str("EXPLAIN "); + self.query(depth); + } + 8 => { + self.out.push_str("SUBSCRIBE ("); + self.query(depth); + self.out.push(')'); + } + 12 => { + // `TABLE `, a bare table query (distinct printer path). + self.out.push_str("TABLE "); + self.qualified_name(); + } + 13 => { + // COPY … TO STDOUT, the COPY statement printer. + self.out.push_str("COPY "); + self.qualified_name(); + self.out.push_str(" TO STDOUT"); + } + 14 => { + // SHOW COLUMNS, a simple SHOW-statement printer. + self.out.push_str("SHOW COLUMNS FROM "); + self.qualified_name(); + } + _ => { + // Row-mutation DML reuses the WHERE/SET expression generators. 
+ if self.chance(1, 2) { + self.out.push_str("DELETE FROM "); + self.qualified_name(); + self.out.push_str(" WHERE "); + self.expr(depth); + } else { + self.out.push_str("UPDATE "); + self.qualified_name(); + self.out.push_str(" SET "); + self.ident(); + self.out.push_str(" = "); + self.expr(depth); + self.out.push_str(" WHERE "); + self.expr(depth); + } + } + } + } + + // --- Connector DDL (sources / sinks / connections / table-from-source) --- + + /// A comma-separated list of identifiers (so bare-keyword and quoted + /// collisions are exercised in option-list positions too). + fn ident_list(&mut self, min: usize, max: usize) { + let n = min + self.pick(max - min + 1); + for i in 0..n { + if i > 0 { + self.out.push_str(", "); + } + self.ident(); + } + } + + /// A simple, round-trip-proven `FORMAT ` clause. + fn format_clause(&mut self) { + self.out.push_str("FORMAT "); + self.one_of(&["BYTES", "TEXT", "JSON"]); + } + + /// A generic option value spanning the `WithOptionValue` variants the + /// permissive `parse_option_value` accepts: a string literal (with the + /// escaping edge cases), a number, a bool, an ident/item-name, or + /// `SECRET `. Any of these round-trips for a config option. + fn option_value(&mut self) { + match self.pick(6) { + 0 | 1 => self.string_value(), + 2 => { + let n = self.pick(1000); + self.out.push_str(&n.to_string()); + } + 3 => self.one_of(&["true", "false"]), + 4 => self.ident(), + _ => { + self.out.push_str("SECRET "); + self.qualified_name(); + } + } + } + + /// One `NAME [= value]` config-option clause. Most options take the generic + /// `option_value`. The handful with a dedicated value grammar the parser + /// dispatches by name are special-cased: `PARTITION BY` (an expression), + /// `RETAIN HISTORY` (`FOR ''`), `TEXT`/`EXCLUDE COLUMNS` (an ident + /// sequence), `BROKER` (a broker string), and the `… CONNECTION` / `SSH + /// TUNNEL` object references (an item name). 
+ fn config_option(&mut self, name: &str) { + self.out.push_str(name); + match name { + // PARTITION BY has two value grammars by context: Kafka sink takes a + // full expression, table-from-source a generic value. A bare number + // is valid in both, a richer expression exercises the Kafka-sink + // printer (and just no-ops for table-from-source). + "PARTITION BY" => { + self.out.push_str(" = "); + if self.chance(1, 2) { + let n = self.pick(100); + self.out.push_str(&n.to_string()); + } else { + self.expr(2); + } + } + "RETAIN HISTORY" => self.out.push_str(" FOR '1s'"), + "TEXT COLUMNS" | "EXCLUDE COLUMNS" => { + self.out.push_str(" = ("); + self.ident_list(1, 3); + self.out.push(')'); + } + "BROKER" => self.out.push_str(" 'localhost:9092'"), + "AWS CONNECTION" | "GCP CONNECTION" | "SSH TUNNEL" => { + self.out.push_str(" = "); + self.ident(); + } + // Generic: usually a value, occasionally none (exercising the + // value-less `value: None` print path, which omits the ` = …`). + _ => { + if self.chance(9, 10) { + self.out.push_str(" = "); + self.option_value(); + } + } + } + } + + /// ` (opt, opt, …)` for an order-stable distinct subset of `names` (stable + /// order ⇒ the printed option order matches, so it reparses identically). + /// Emits nothing when the subset is empty unless `require_one` (some + /// statements allow an absent list, others require a non-empty one). + fn config_option_list(&mut self, names: &[&str], require_one: bool) { + // Defensive: if source-scraping ever yields no names, emit no list + // rather than indexing an empty slice below. 
+ if names.is_empty() { + return; + } + let mut included: Vec = (0..names.len()) + .filter(|_| self.budget() && self.chance(1, 2)) + .collect(); + if included.is_empty() { + if require_one { + included.push(self.pick(names.len())); + } else { + return; + } + } + self.out.push_str(" ("); + for (j, &i) in included.iter().enumerate() { + if j > 0 { + self.out.push_str(", "); + } + self.config_option(names[i]); + } + self.out.push(')'); + } + + /// The multi-output source table-selection clause. + fn for_tables(&mut self) { + match self.pick(3) { + 0 => self.out.push_str(" FOR ALL TABLES"), + 1 => { + self.out.push_str(" FOR TABLES ("); + self.qualified_name(); + if self.chance(1, 2) { + self.out.push_str(", "); + self.qualified_name(); + } + self.out.push(')'); + } + _ => { + self.out.push_str(" FOR SCHEMAS ("); + self.ident(); + self.out.push(')'); + } + } + } + + /// An optional `INCLUDE , …` clause (distinct, order-stable). + fn include_metadata(&mut self) { + let opts = ["KEY", "PARTITION", "OFFSET", "TIMESTAMP", "HEADERS"]; + let included: Vec = (0..opts.len()).filter(|_| self.chance(1, 3)).collect(); + if included.is_empty() { + return; + } + self.out.push_str(" INCLUDE "); + for (j, &i) in included.iter().enumerate() { + if j > 0 { + self.out.push_str(", "); + } + self.out.push_str(opts[i]); + // KEY / PARTITION can carry an optional alias. + if (opts[i] == "KEY" || opts[i] == "PARTITION") && self.chance(1, 2) { + self.out.push_str(" AS "); + self.ident(); + } + } + } + + /// The option-heavy connector statements plus the SHOW CREATE / EXPLAIN / + /// ALTER / VALIDATE forms around them. Every shape here is one the + /// parser+printer round-trip is *proven* to preserve (it mirrors the + /// canonical forms in `sql-parser/tests/testdata`). 
+    /// The generator varies the
+    /// names (`ident`/`qualified_name`, so bare-keyword and quoted collisions
+    /// are exercised in each position) and the embedded queries (`query`), which
+    /// is where the novel coverage comes from.
+    ///
+    /// `depth` is forwarded to `query` for the EXPLAIN arm only.
+    fn connector_ddl(&mut self, depth: u32) {
+        // Nine slots: CREATE TABLE … FROM SOURCE takes two of them (0 and 1).
+        match self.pick(9) {
+            0 | 1 => self.create_table_from_source(),
+            2 => self.create_source(),
+            3 => self.create_sink(),
+            4 => self.create_connection(),
+            5 => {
+                // CREATE SOURCE … FROM WEBHOOK.
+                self.out.push_str("CREATE SOURCE ");
+                self.ident();
+                self.out.push_str(" IN CLUSTER ");
+                self.ident();
+                self.out.push_str(" FROM WEBHOOK BODY FORMAT ");
+                self.one_of(&["JSON", "BYTES", "TEXT"]);
+            }
+            6 => {
+                // SHOW CREATE <object type> <name>.
+                self.out.push_str("SHOW CREATE ");
+                self.one_of(&[
+                    "SOURCE ",
+                    "SINK ",
+                    "TABLE ",
+                    "VIEW ",
+                    "MATERIALIZED VIEW ",
+                    "INDEX ",
+                    "CONNECTION ",
+                ]);
+                self.qualified_name();
+            }
+            7 => {
+                // EXPLAIN <stage> FOR <query>, the EXPLAIN-statement printer.
+                self.out.push_str("EXPLAIN ");
+                self.one_of(&["OPTIMIZED PLAN FOR ", "TIMESTAMP FOR "]);
+                self.query(depth);
+            }
+            _ => {
+                // ALTER SOURCE … ADD SUBSOURCE, and VALIDATE CONNECTION.
+                if self.chance(1, 2) {
+                    self.out.push_str("ALTER SOURCE ");
+                    self.qualified_name();
+                    self.out.push_str(" ADD SUBSOURCE ");
+                    self.ident_list(1, 3);
+                    if self.chance(1, 2) {
+                        // " WITH" is already written, so the list is requested
+                        // with `require_one = true`.
+                        self.out.push_str(" WITH");
+                        self.config_option_list(option_names("AlterSourceAddSubsourceOptionName"), true);
+                    }
+                } else {
+                    self.out.push_str("VALIDATE CONNECTION ");
+                    self.qualified_name();
+                }
+            }
+        }
+    }
+
+    /// `CREATE TABLE … FROM SOURCE`, the subsource statement (database-issues
+    /// #10034 family). Exercises the optional column-or-constraint spec, the
+    /// `REFERENCE` external reference, and the `WITH (…)` purification options.
+    fn create_table_from_source(&mut self) {
+        self.out.push_str("CREATE TABLE ");
+        if self.chance(1, 6) {
+            self.out.push_str("IF NOT EXISTS ");
+        }
+        self.qualified_name();
+        // Optional column / constraint spec. A typed column makes the spec
+        // `Defined`, a bare column makes it `Named`. Both print and reparse.
+        if self.chance(2, 3) {
+            self.out.push('(');
+            // 0..=2 columns; zero is allowed because a constraint or the
+            // fallback identifier below can still fill the parentheses.
+            let cols = self.pick(3);
+            // Columns are either all bare (`Named`) or all typed (`Defined`).
+            // Mixing the two is a parse error ("cannot mix column definitions
+            // and column names").
+            let typed = self.chance(1, 2);
+            // Tracks whether anything has been written inside the parentheses,
+            // which decides both separator placement and the fallback below.
+            let mut wrote = false;
+            for _ in 0..cols {
+                if wrote {
+                    self.out.push_str(", ");
+                }
+                self.ident();
+                if typed {
+                    self.out.push(' ');
+                    self.data_type(2);
+                }
+                wrote = true;
+            }
+            if self.chance(1, 3) {
+                if wrote {
+                    self.out.push_str(", ");
+                }
+                self.out.push_str("PRIMARY KEY (");
+                self.ident();
+                self.out.push(')');
+                wrote = true;
+            }
+            // `()` is not valid, guarantee at least one element.
+            if !wrote {
+                self.ident();
+            }
+            self.out.push(')');
+        }
+        self.out.push_str(" FROM SOURCE ");
+        self.qualified_name();
+        if self.chance(3, 4) {
+            self.out.push_str(" (REFERENCE = ");
+            self.qualified_name();
+            self.out.push(')');
+        }
+        if self.chance(1, 2) {
+            self.out.push_str(" WITH");
+            self.config_option_list(option_names("TableFromSourceOptionName"), true);
+        }
+    }
+
+    /// `CREATE SOURCE` over every connector kind (load generator, Kafka,
+    /// Postgres, MySQL, SQL Server), each with its full config-option space, and
+    /// the source-level `WITH (…)` options.
+    fn create_source(&mut self) {
+        self.out.push_str("CREATE SOURCE ");
+        if self.chance(1, 6) {
+            self.out.push_str("IF NOT EXISTS ");
+        }
+        self.qualified_name();
+        if self.chance(1, 2) {
+            self.out.push_str(" IN CLUSTER ");
+            self.ident();
+        }
+        // One arm per connector kind, chosen uniformly.
+        match self.pick(5) {
+            0 => {
+                // Load generator, self-contained, needs no connection. Option
+                // and generator-kind validity is a planning concern, any pairing
+                // parses and round-trips.
+                self.out.push_str(" FROM LOAD GENERATOR ");
+                self.one_of(&[
+                    "COUNTER",
+                    "CLOCK",
+                    "AUCTION",
+                    "MARKETING",
+                    "TPCH",
+                    "KEY VALUE",
+                ]);
+                self.config_option_list(option_names("LoadGeneratorOptionName"), false);
+            }
+            1 => {
+                // Kafka, connection-backed, with FORMAT / INCLUDE / ENVELOPE.
+                self.out.push_str(" FROM KAFKA CONNECTION ");
+                self.qualified_name();
+                self.config_option_list(option_names("KafkaSourceConfigOptionName"), false);
+                if self.chance(2, 3) {
+                    self.out.push(' ');
+                    self.format_clause();
+                }
+                self.include_metadata();
+                if self.chance(1, 2) {
+                    self.out.push_str(" ENVELOPE ");
+                    self.one_of(&["NONE", "UPSERT", "DEBEZIUM"]);
+                }
+            }
+            2 => {
+                // Postgres: the option list is requested with
+                // `require_one = true`, unlike the MySQL / SQL Server arms.
+                self.out.push_str(" FROM POSTGRES CONNECTION ");
+                self.qualified_name();
+                self.config_option_list(option_names("PgConfigOptionName"), true);
+                self.for_tables();
+            }
+            3 => {
+                self.out.push_str(" FROM MYSQL CONNECTION ");
+                self.qualified_name();
+                self.config_option_list(option_names("MySqlConfigOptionName"), false);
+                self.for_tables();
+            }
+            _ => {
+                self.out.push_str(" FROM SQL SERVER CONNECTION ");
+                self.qualified_name();
+                self.config_option_list(option_names("SqlServerConfigOptionName"), false);
+                self.for_tables();
+            }
+        }
+        if self.chance(1, 3) {
+            self.out.push_str(" WITH");
+            self.config_option_list(option_names("CreateSourceOptionName"), true);
+        }
+    }
+
+    /// `CREATE SINK … INTO KAFKA`, the full sink config-option space, optional
+    /// KEY, FORMAT, ENVELOPE, and the sink-level `WITH (…)` options.
+    fn create_sink(&mut self) {
+        self.out.push_str("CREATE SINK ");
+        if self.chance(1, 6) {
+            self.out.push_str("IF NOT EXISTS ");
+        }
+        // The sink name is optional in the grammar, emit it most of the time.
+        if self.chance(3, 4) {
+            self.qualified_name();
+        }
+        if self.chance(1, 2) {
+            self.out.push_str(" IN CLUSTER ");
+            self.ident();
+        }
+        self.out.push_str(" FROM ");
+        self.qualified_name();
+        self.out.push_str(" INTO KAFKA CONNECTION ");
+        self.qualified_name();
+        self.config_option_list(option_names("KafkaSinkConfigOptionName"), false);
+        if self.chance(1, 3) {
+            self.out.push_str(" KEY (");
+            self.ident_list(1, 2);
+            self.out.push(')');
+        }
+        // Unlike the Kafka source arm, the format clause is always emitted here.
+        self.out.push(' ');
+        self.format_clause();
+        if self.chance(1, 2) {
+            self.out.push_str(" ENVELOPE ");
+            self.one_of(&["UPSERT", "DEBEZIUM"]);
+        }
+        if self.chance(1, 3) {
+            self.out.push_str(" WITH");
+            self.config_option_list(option_names("CreateSinkOptionName"), true);
+        }
+    }
+
+    /// `CREATE CONNECTION … TO <type> (…)` over the full connection option
+    /// space. The parser's option grammar is unified across connection types
+    /// (type/option compatibility is a planning concern), so any option subset
+    /// parses and round-trips under any `TO <type>`.
+    fn create_connection(&mut self) {
+        self.out.push_str("CREATE CONNECTION ");
+        if self.chance(1, 6) {
+            self.out.push_str("IF NOT EXISTS ");
+        }
+        self.qualified_name();
+        self.out.push_str(" TO ");
+        self.one_of(&[
+            "KAFKA",
+            "CONFLUENT SCHEMA REGISTRY",
+            "POSTGRES",
+            "MYSQL",
+            "SQL SERVER",
+            "AWS",
+            "SSH TUNNEL",
+        ]);
+        self.config_option_list(option_names("ConnectionOptionName"), true);
+        // Occasionally append the trailing `WITH (VALIDATE = …)` clause.
+        if self.chance(1, 8) {
+            self.out.push_str(" WITH (VALIDATE = ");
+            self.one_of(&["true", "false"]);
+            self.out.push(')');
+        }
+    }
+
+    /// Non-query statement forms the query-centric `statement()` arm doesn't
+    /// reach: the prepared-statement protocol, cursors, session/transaction
+    /// commands, and assorted DDL. Names go through `ident()`/`qualified_name()`
+    /// so a bare-keyword collision is tested in each statement's special
+    /// position (where the optional keyword / clause makes quoting matter).
+    fn rare_statement(&mut self, depth: u32) {
+        // 38 equally likely forms; `depth` only matters for the arms that embed
+        // a query or an expression.
+        match self.pick(38) {
+            0 => {
+                self.out.push_str("DEALLOCATE ");
+                if self.chance(1, 2) {
+                    self.out.push_str("PREPARE ");
+                }
+                if self.chance(1, 5) {
+                    self.out.push_str("ALL");
+                } else {
+                    self.ident();
+                }
+            }
+            1 => {
+                self.out.push_str("PREPARE ");
+                self.ident();
+                self.out.push_str(" AS ");
+                self.query(depth);
+            }
+            2 => {
+                self.out.push_str("EXECUTE ");
+                self.ident();
+                if self.chance(1, 2) {
+                    self.out.push_str(" (");
+                    self.expr(depth);
+                    self.out.push(')');
+                }
+            }
+            3 => {
+                self.out.push_str("DECLARE ");
+                self.ident();
+                self.out.push_str(" CURSOR FOR ");
+                self.query(depth);
+            }
+            4 => {
+                self.out.push_str("FETCH ");
+                if self.chance(1, 2) {
+                    self.out.push_str("ALL ");
+                }
+                self.out.push_str("FROM ");
+                self.ident();
+            }
+            5 => {
+                self.out.push_str("CLOSE ");
+                self.ident();
+            }
+            6 => {
+                self.out.push_str("SET ");
+                self.ident();
+                self.out.push_str(" TO ");
+                self.value();
+            }
+            7 => {
+                self.out.push_str("RESET ");
+                self.ident();
+            }
+            8 => {
+                self.out.push_str("SHOW ");
+                self.ident();
+            }
+            9 => {
+                self.out.push_str("DROP TABLE ");
+                self.qualified_name();
+            }
+            10 => {
+                self.out.push_str("DROP VIEW ");
+                self.qualified_name();
+            }
+            11 => {
+                self.out.push_str("COMMENT ON TABLE ");
+                self.qualified_name();
+                self.out.push_str(" IS 'c'");
+            }
+            12 => {
+                self.out.push_str("GRANT SELECT ON TABLE ");
+                self.qualified_name();
+                self.out.push_str(" TO ");
+                self.ident();
+            }
+            13 => {
+                self.out.push_str("REVOKE SELECT ON TABLE ");
+                self.qualified_name();
+                self.out.push_str(" FROM ");
+                self.ident();
+            }
+            14 => {
+                self.out.push_str("ALTER TABLE ");
+                self.qualified_name();
+                self.out.push_str(" RENAME TO ");
+                self.ident();
+            }
+            15 => {
+                self.out.push_str("CREATE INDEX ");
+                self.ident();
+                self.out.push_str(" ON ");
+                self.qualified_name();
+                self.out.push_str(" (");
+                self.ident();
+                self.out.push(')');
+            }
+            16 => self.out.push_str("BEGIN"),
+            17 => self.out.push_str("COMMIT"),
+            18 => self.out.push_str("ROLLBACK"),
+            19 => {
+                self.out.push_str("CREATE DATABASE ");
+                self.ident();
+            }
+            20 => {
+                self.out.push_str("CREATE SCHEMA ");
+                self.qualified_name();
+            }
+            21 => {
+                self.out.push_str("CREATE ROLE ");
+                self.ident();
+            }
+            22 => {
+                self.out.push_str("DROP DATABASE ");
+                self.ident();
+            }
+            23 => {
+                self.out.push_str("DROP SCHEMA ");
+                self.qualified_name();
+            }
+            24 => {
+                self.out.push_str("DROP CLUSTER ");
+                self.ident();
+            }
+            25 => {
+                self.out.push_str("ALTER TABLE ");
+                self.qualified_name();
+                self.out.push_str(" OWNER TO ");
+                self.ident();
+            }
+            26 => {
+                self.out.push_str("ALTER TABLE ");
+                self.qualified_name();
+                self.out.push_str(" ADD COLUMN ");
+                self.ident();
+                self.out.push(' ');
+                self.data_type(2);
+            }
+            27 => {
+                self.out.push_str("SHOW CREATE TABLE ");
+                self.qualified_name();
+            }
+            28 => {
+                self.out.push_str("CREATE SECRET ");
+                self.ident();
+                self.out.push_str(" AS 'secret'");
+            }
+            29 => {
+                self.out.push_str("ALTER SYSTEM SET ");
+                self.ident();
+                self.out.push_str(" = ");
+                self.value();
+            }
+            30 => {
+                self.out.push_str("ALTER SYSTEM RESET ");
+                self.ident();
+            }
+            31 => {
+                // Role-membership grant/revoke (distinct from the privilege form).
+                self.out.push_str("GRANT ");
+                self.ident();
+                self.out.push_str(" TO ");
+                self.ident();
+            }
+            32 => {
+                self.out.push_str("REVOKE ");
+                self.ident();
+                self.out.push_str(" FROM ");
+                self.ident();
+            }
+            33 => {
+                self.out.push_str("DROP TABLE IF EXISTS ");
+                self.qualified_name();
+            }
+            34 => {
+                self.out.push_str("CREATE TYPE ");
+                self.ident();
+                self.out.push_str(" AS LIST (ELEMENT TYPE = ");
+                self.data_type(1);
+                self.out.push(')');
+            }
+            35 => {
+                self.out.push_str("CREATE TYPE ");
+                self.ident();
+                self.out.push_str(" AS MAP (KEY TYPE = text, VALUE TYPE = ");
+                self.data_type(1);
+                self.out.push(')');
+            }
+            36 => {
+                self.out.push_str("ALTER INDEX ");
+                self.qualified_name();
+                self.out.push_str(" RENAME TO ");
+                self.ident();
+            }
+            _ => {
+                self.out.push_str("CREATE OR REPLACE VIEW ");
+                self.ident();
+                self.out.push_str(" AS ");
+                self.query(depth);
+            }
+        }
+    }
+
+    // --- Query / SELECT structure ------------------------------------------
+
+    /// A full query: optional CTE prefix, a set-expression body, then optional
+    /// ORDER BY / LIMIT / OFFSET. CTEs are only generated while `depth > 0`.
+    fn query(&mut self, depth: u32) {
+        // Optional CTEs, plain `WITH` or mz's `WITH MUTUALLY RECURSIVE`
+        // (which declares each CTE's output column types: a distinct grammar).
+        if depth > 0 && self.chance(1, 4) {
+            let mutually_recursive = self.chance(1, 3);
+            self.out.push_str(if mutually_recursive {
+                "WITH MUTUALLY RECURSIVE "
+            } else {
+                "WITH "
+            });
+            let n = 1 + self.pick(2);
+            for i in 0..n {
+                if i > 0 {
+                    self.out.push_str(", ");
+                }
+                self.ident();
+                if mutually_recursive {
+                    // Typed column list: `(name type, …)` with 1..=2 columns.
+                    self.out.push('(');
+                    let cols = 1 + self.pick(2);
+                    for j in 0..cols {
+                        if j > 0 {
+                            self.out.push_str(", ");
+                        }
+                        self.ident();
+                        self.out.push(' ');
+                        self.data_type(2);
+                    }
+                    self.out.push(')');
+                }
+                self.out.push_str(" AS (");
+                self.set_expr(depth - 1);
+                self.out.push(')');
+            }
+            self.out.push(' ');
+        }
+        self.set_expr(depth);
+        // ORDER BY / LIMIT / OFFSET.
+        if self.chance(1, 2) {
+            self.out.push_str(" ORDER BY ");
+            let n = 1 + self.pick(2);
+            for i in 0..n {
+                if i > 0 {
+                    self.out.push_str(", ");
+                }
+                self.expr(depth.saturating_sub(1));
+                if self.chance(1, 2) {
+                    self.one_of(&[" ASC", " DESC"]);
+                }
+            }
+        }
+        if self.chance(1, 3) {
+            self.out.push_str(" LIMIT ");
+            self.value();
+        }
+        if self.chance(1, 4) {
+            self.out.push_str(" OFFSET ");
+            self.value();
+        }
+    }
+
+    /// A query body: a set operation between two SELECTs (only while
+    /// `depth > 0`), a `VALUES` list, or a single SELECT.
+    fn set_expr(&mut self, depth: u32) {
+        if depth > 0 && self.chance(1, 4) {
+            // Set operation between two query bodies.
+            self.select(depth - 1);
+            self.out.push(' ');
+            self.one_of(&["UNION", "INTERSECT", "EXCEPT"]);
+            if self.chance(1, 2) {
+                self.out.push_str(" ALL");
+            }
+            self.out.push(' ');
+            self.select(depth - 1);
+        } else if self.chance(1, 6) {
+            // VALUES.
+            self.out.push_str("VALUES ");
+            let rows = 1 + self.pick(2);
+            for i in 0..rows {
+                if i > 0 {
+                    self.out.push_str(", ");
+                }
+                self.out.push('(');
+                self.expr_list(depth.saturating_sub(1), 1, 3);
+                self.out.push(')');
+            }
+        } else {
+            self.select(depth);
+        }
+    }
+
+    /// A single `SELECT` with optional DISTINCT [ON], FROM, WHERE, GROUP BY,
+    /// HAVING and named WINDOW clauses. Nested pieces run one level shallower.
+    fn select(&mut self, depth: u32) {
+        let d = depth.saturating_sub(1);
+        self.out.push_str("SELECT ");
+        if self.chance(1, 6) {
+            self.out.push_str("DISTINCT ");
+            if self.chance(1, 2) {
+                self.out.push_str("ON (");
+                self.expr_list(d, 1, 2);
+                self.out.push_str(") ");
+            }
+        }
+        // Projection.
+        let cols = 1 + self.pick(3);
+        for i in 0..cols {
+            if i > 0 {
+                self.out.push_str(", ");
+            }
+            if self.chance(1, 6) {
+                self.out.push('*');
+            } else {
+                self.expr(d);
+                if self.chance(1, 3) {
+                    self.out.push_str(" AS ");
+                    self.ident();
+                }
+            }
+        }
+        // FROM.
+        if self.chance(3, 4) {
+            self.out.push_str(" FROM ");
+            self.from_item(d);
+            // Comma joins (bounded so abundant input can't explode the width).
+            let extra = self.pick(3);
+            for _ in 0..extra {
+                if !self.budget() {
+                    break;
+                }
+                self.out.push_str(", ");
+                self.from_item(d);
+            }
+        }
+        if self.chance(1, 2) {
+            self.out.push_str(" WHERE ");
+            self.expr(d);
+        }
+        if self.chance(1, 3) {
+            self.out.push_str(" GROUP BY ");
+            self.group_by(d);
+        }
+        if self.chance(1, 4) {
+            self.out.push_str(" HAVING ");
+            self.expr(d);
+        }
+        // Named WINDOW clause (`WINDOW w AS (…)`), referenced by `OVER w`.
+        if self.chance(1, 6) {
+            self.out.push_str(" WINDOW ");
+            let n = 1 + self.pick(2);
+            for i in 0..n {
+                if i > 0 {
+                    self.out.push_str(", ");
+                }
+                self.ident();
+                self.out.push_str(" AS ");
+                self.window_def(d);
+            }
+        }
+    }
+
+    /// The body of a `GROUP BY`: a plain expression list (two of four slots),
+    /// `GROUPING SETS ((…), (…))`, or `ROLLUP (…)` / `CUBE (…)`.
+    fn group_by(&mut self, depth: u32) {
+        match self.pick(4) {
+            0 | 1 => self.expr_list(depth, 1, 3),
+            2 => {
+                self.out.push_str("GROUPING SETS (");
+                self.out.push('(');
+                self.expr_list(depth, 1, 2);
+                self.out.push_str("), (");
+                self.expr_list(depth, 1, 2);
+                self.out.push_str("))");
+            }
+            _ => {
+                self.one_of(&["ROLLUP", "CUBE"]);
+                self.out.push_str(" (");
+                self.expr_list(depth, 1, 3);
+                self.out.push(')');
+            }
+        }
+    }
+
+    /// One FROM item: a table name, a derived table (only while `depth > 0`),
+    /// or a table function, optionally aliased and optionally joined to another
+    /// item. Once the budget is spent it degrades to a bare table name.
+    fn from_item(&mut self, depth: u32) {
+        if !self.budget() {
+            self.qualified_name();
+            return;
+        }
+        // NOTE(review): LATERAL is emitted ahead of every arm, including the
+        // plain table name; confirm the parser accepts `LATERAL <table>`,
+        // otherwise those inputs are rejected and discarded by the target.
+        if self.chance(1, 3) {
+            self.out.push_str("LATERAL ");
+        }
+        // Set once an arm has already written its own alias, so the optional
+        // alias below cannot append a second `AS …` (which can never parse).
+        let mut aliased = false;
+        match self.pick(4) {
+            0 | 1 => {
+                self.qualified_name();
+            }
+            2 if depth > 0 => {
+                // Derived table.
+                self.out.push('(');
+                self.query(depth - 1);
+                self.out.push(')');
+                self.alias();
+                aliased = true;
+            }
+            _ => {
+                // Table function.
+                self.one_of(&["generate_series", "unnest", "\"row\"", "f"]);
+                self.out.push('(');
+                self.expr_list(depth.saturating_sub(1), 0, 2);
+                self.out.push(')');
+            }
+        }
+        // Optional alias for the simple cases too. The roll is drawn
+        // unconditionally so the decision consumes input the same way in every
+        // arm; it is only acted on when no alias has been written yet.
+        let wants_alias = self.chance(1, 3);
+        if wants_alias && !aliased {
+            self.alias();
+        }
+        // Optional join.
+        if depth > 0 && self.chance(1, 3) {
+            self.join_op();
+            self.from_item(depth - 1);
+            match self.pick(3) {
+                0 => {
+                    self.out.push_str(" ON ");
+                    self.expr(depth - 1);
+                }
+                1 => {
+                    self.out.push_str(" USING (");
+                    self.ident();
+                    self.out.push(')');
+                }
+                // No join constraint at all.
+                _ => {}
+            }
+        }
+    }
+
+    /// Emits one join keyword sequence, padded with a space on both sides.
+    fn join_op(&mut self) {
+        self.one_of(&[
+            " JOIN ",
+            " INNER JOIN ",
+            " LEFT JOIN ",
+            " RIGHT JOIN ",
+            " FULL OUTER JOIN ",
+            " CROSS JOIN ",
+        ]);
+    }
+
+    /// ` AS name`, optionally followed by a column-alias list ` (c1[, c2])`.
+    fn alias(&mut self) {
+        self.out.push_str(" AS ");
+        self.ident();
+        if self.chance(1, 3) {
+            self.out.push_str(" (");
+            let n = 1 + self.pick(2);
+            for i in 0..n {
+                if i > 0 {
+                    self.out.push_str(", ");
+                }
+                self.ident();
+            }
+            self.out.push(')');
+        }
+    }
+}
+
+/// Build a query string from the byte stream.
+fn generate(data: &[u8]) -> String {
+    let mut u = Unstructured::new(data);
+    let mut g = Gen {
+        u: &mut u,
+        out: String::new(),
+    };
+    // Mostly a structured statement (deep + valid, the round-trip the printer
+    // must preserve). ~1/6 of runs are full-vocabulary soup (reaching shapes the
+    // grammar doesn't enumerate + the parser/lexer error paths). One oracle for
+    // both.
+    if g.chance(1, 6) {
+        g.soup();
+    } else {
+        g.statement(MAX_DEPTH);
+    }
+    // Trailing noise exercises the parser's handling of unexpected tokens
+    // after an otherwise-complete statement.
+    g.maybe_noise();
+    g.out
+}
+
+fuzz_target!(|data: &[u8]| {
+    let sql = generate(data);
+    // Inputs that do not parse, or that parse to anything other than exactly
+    // one statement, are outside what the oracles below check.
+    let Ok(stmts) = parse_statements(&sql) else {
+        return;
+    };
+    if stmts.len() != 1 {
+        return;
+    }
+    let orig_ast = stmts.into_iter().next().unwrap().ast;
+
+    // Display oracle works on the unnormalized AST (stable-string equality).
+    check_display(&orig_ast);
+
+    // Pretty oracle compares normalized ASTs for full structural equality.
+    let mut normalized = orig_ast;
+    normalize(&mut normalized);
+    check_pretty(&sql, &normalized);
+});
diff --git a/src/sql-parser/fuzz/grammar/sql.g b/src/sql-parser/fuzz/grammar/sql.g
new file mode 100644
index 0000000000000..ab74f90aa0ab4
--- /dev/null
+++ b/src/sql-parser/fuzz/grammar/sql.g
@@ -0,0 +1,310 @@
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+
+# Grammar for Materialize SQL round-trip fuzzing.
+#
+# `build.rs` compiles this file into a rule table (`$OUT_DIR/grammar.rs`); the
+# `grammar` fuzz target walks the table, mapping libFuzzer bytes to production
+# choices, then checks that `parse -> print -> reparse` preserves the AST. The
+# grammar is the single source of truth and is compiled at build time, so there
+# is no generated file to keep in sync. Edit this file and rebuild.
+#
+# Format:
+#   <rule> = <alt> | <alt> | ... ;
+# An <alt> is a whitespace-separated sequence of items; an item is one of:
+#   "literal"    a terminal, emitted verbatim (may contain | ; = inside quotes)
+#   other_rule   a reference to another rule (recursively expanded)
+#   @ident       a generated identifier (bare name or quoted collision)
+#   @kw          a bare keyword used as an identifier (printer-quoting stress)
+#   @int         a small integer literal
+#   @str         a string literal (with lexing/escaping edge cases)
+# `#` starts a comment line. The start symbol is the first rule (`statement`).
+# Bias is expressed by repeating an alternative (each alt is chosen uniformly).
+ +statement = + query + | query + | query + | "SELECT " select_list + | query " UNION " set_rhs + | query " UNION ALL " set_rhs + | query " INTERSECT " set_rhs + | query " EXCEPT " set_rhs + | "INSERT INTO " table_name " " query + | "CREATE VIEW " ident " AS " query + | "CREATE MATERIALIZED VIEW " ident " AS " query + | "EXPLAIN " query + | "SUBSCRIBE (" query ")" + | "DELETE FROM " table_name " WHERE " expr + | "UPDATE " table_name " SET " ident " = " expr " WHERE " expr + ; + +set_rhs = + query + | "SELECT " select_list + ; + +query = + "SELECT " distinct_opt select_list " FROM " from_item where_opt group_opt having_opt order_opt limit_opt + ; + +distinct_opt = + "" + | "" + | "DISTINCT " + ; + +select_list = + "*" + | select_item + | select_item ", " select_list + ; + +select_item = + expr + | expr " AS " ident + ; + +from_item = + table_name + | table_name + | table_name " AS " ident + | from_item " JOIN " from_item " ON " expr + | from_item " LEFT JOIN " from_item " ON " expr + | from_item " CROSS JOIN " from_item + | "(" query ") AS " ident + ; + +table_name = + ident + | ident "." 
ident + ; + +where_opt = + "" + | "" + | " WHERE " expr + ; + +group_opt = + "" + | "" + | " GROUP BY " expr_list + ; + +having_opt = + "" + | " HAVING " expr + ; + +order_opt = + "" + | "" + | " ORDER BY " order_list + ; + +order_list = + expr order_dir + | expr order_dir ", " order_list + ; + +order_dir = + "" + | " ASC" + | " DESC" + | " ASC NULLS LAST" + | " DESC NULLS FIRST" + ; + +limit_opt = + "" + | "" + | " LIMIT " @int + | " LIMIT " @int " OFFSET " @int + ; + +expr_list = + expr + | expr ", " expr_list + ; + +expr = + leaf + | leaf + | expr " " binop " " expr + | unop expr + | "(" expr ")" + | expr "::" type + | "CAST(" expr " AS " type ")" + | expr " IS NULL" + | expr " IS NOT NULL" + | expr " IS DISTINCT FROM " expr + | expr " BETWEEN " expr " AND " expr + | expr " IN (" expr_list ")" + | expr " IN (" query ")" + | expr " NOT IN (" expr_list ")" + | "EXISTS (" query ")" + | expr " LIKE " expr + | expr " LIKE " expr " ESCAPE " expr + | expr " ILIKE " expr + | expr quant_op " " quant " (" query ")" + | expr quant_op " " quant " (ARRAY[" expr_list "])" + | func_call + | case_expr + | "ARRAY[" expr_list "]" + | "ARRAY(" query ")" + | "LIST[" expr_list "]" + | "ROW(" expr_list ")" + | "(" query ")" + # Postfix forms: `COLLATE`/`AT TIME ZONE` (loose, `PostfixCollateAt`) and + # subscript/field-access (tight, `PostfixSubscriptCast`). These stress the + # printer's receiver/operand parenthesization the most. + | expr " COLLATE " collation + | expr " AT TIME ZONE " expr + | expr subscript + | "(" expr ")." ident + | "(" expr ").*" + | "MAP[" map_entries "]" + | "MAP(" query ")" + | "NULLIF(" expr ", " expr ")" + | special_func + | typed_literal + ; + +leaf = + ident + | qualified + | @int + | @str + | "true" + | "false" + | "null" + | param + ; + +qualified = + ident "." 
ident + ; + +param = + "$" @int + ; + +quant = + "ANY" + | "ALL" + | "SOME" + ; + +quant_op = + " =" + | " <>" + | " <" + | " >" + | " <=" + | " >=" + ; + +binop = + "+" | "-" | "*" | "/" | "%" | "||" | "=" | "<>" | "!=" | "<" | ">" | "<=" | ">=" + | "AND" | "OR" | "->" | "->>" | "#>" | "#>>" | "@>" | "<@" + ; + +unop = + "-" | "+" | "NOT " | "~" + ; + +type = + "int4" | "int8" | "integer" | "text" | "boolean" | "double precision" | "numeric" + | "numeric(10, 2)" | "date" | "timestamp" | "timestamptz" | "jsonb" | "uuid" + | type "[]" + | type " list" + ; + +func_call = + func_name "(" ")" + | func_name "(" expr_list ")" + | func_name "(DISTINCT " expr_list ")" + | func_name "(" expr_list ") OVER ()" + | func_name "(" expr_list ") OVER (PARTITION BY " expr_list ")" + | func_name "(" expr_list ") OVER (ORDER BY " order_list " " frame ")" + | func_name "(" expr_list ") FILTER (WHERE " expr ")" + | "count(*)" + ; + +frame = + "ROWS BETWEEN " frame_bound " AND " frame_bound + | "RANGE BETWEEN " frame_bound " AND " frame_bound + | "GROUPS BETWEEN " frame_bound " AND " frame_bound + | "ROWS " frame_bound + ; + +frame_bound = + "UNBOUNDED PRECEDING" + | "CURRENT ROW" + | @int " PRECEDING" + | @int " FOLLOWING" + | "UNBOUNDED FOLLOWING" + ; + +collation = + ident + | ident "." 
ident + ; + +subscript = + "[" expr "]" + | "[" expr ":" expr "]" + | "[" expr ":]" + | "[:" expr "]" + ; + +map_entries = + expr " => " expr + | expr " => " expr ", " map_entries + ; + +special_func = + "position(" expr " IN " expr ")" + | "extract(" extract_field " FROM " expr ")" + | "substring(" expr " FROM " expr ")" + | "substring(" expr " FROM " expr " FOR " expr ")" + | "trim(" expr ")" + | "trim(BOTH " expr " FROM " expr ")" + | "trim(LEADING " expr " FROM " expr ")" + ; + +extract_field = + "'year'" | "'month'" | "'day'" | "'hour'" | "'epoch'" | "'dow'" + ; + +typed_literal = + "DATE " @str + | "TIMESTAMP " @str + | "TIME " @str + | "INTERVAL " @str + ; + +func_name = + "count" | "sum" | "max" | "min" | "abs" | "lower" | "upper" | "coalesce" + | "length" | "greatest" | "least" | "generate_series" | "f" + ; + +case_expr = + "CASE WHEN " expr " THEN " expr " ELSE " expr " END" + | "CASE WHEN " expr " THEN " expr " END" + | "CASE " expr " WHEN " expr " THEN " expr " ELSE " expr " END" + ; + +# Identifiers, biased ~4:1 toward ordinary names over bare keywords (each +# alternative is equally likely); the bare-keyword case is the printer-quoting +# stressor (does the printer keep a keyword-as-identifier unambiguous?). +ident = + @ident | @ident | @ident | @ident | @kw + ; diff --git a/src/sql-server-util/fuzz/.gitignore b/src/sql-server-util/fuzz/.gitignore new file mode 100644 index 0000000000000..ab0eaa1a49031 --- /dev/null +++ b/src/sql-server-util/fuzz/.gitignore @@ -0,0 +1,5 @@ +target/ +corpus/ +artifacts/ +coverage/ +Cargo.lock diff --git a/src/sql-server-util/fuzz/Cargo.toml b/src/sql-server-util/fuzz/Cargo.toml new file mode 100644 index 0000000000000..7d3da5add70a5 --- /dev/null +++ b/src/sql-server-util/fuzz/Cargo.toml @@ -0,0 +1,28 @@ +# Fuzz crate for mz-sql-server-util desc proto round-trip. 
+# `SqlServerTableDesc` describes external-database schemas, so a decoder bug +# here is reachable from a compromised upstream SQL Server or from on-disk +# catalog bytes. + +[package] +workspace = "../../../test/cargo-fuzz" +name = "mz-sql-server-util-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +mz-sql-server-util = { path = "..", features = ["proptest"] } +mz-proto = { path = "../../proto" } +prost = "0.14.3" +proptest = { version = "1.11.0", default-features = false, features = ["std"] } + +[[bin]] +name = "sql_server_table_desc_proto_roundtrip" +path = "fuzz_targets/sql_server_table_desc_proto_roundtrip.rs" +test = false +doc = false +bench = false diff --git a/src/sql-server-util/fuzz/fuzz_targets/sql_server_table_desc_proto_roundtrip.rs b/src/sql-server-util/fuzz/fuzz_targets/sql_server_table_desc_proto_roundtrip.rs new file mode 100644 index 0000000000000..dd7b74e849a2f --- /dev/null +++ b/src/sql-server-util/fuzz/fuzz_targets/sql_server_table_desc_proto_roundtrip.rs @@ -0,0 +1,263 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `ProtoSqlServerTableDesc` <-> `SqlServerTableDesc` round-trip. +//! Describes external-database schemas, so a decoder bug here is reachable +//! from a compromised upstream SQL Server or on-disk catalog bytes. +//! +//! Input generation is split across four arms keyed off the first input +//! byte so a single byte stream exercises all of them over time: +//! +//! 1. **Valid-value arm.** A 32-byte seed (drawn from the input) drives +//! 
proptest's `Arbitrary for SqlServerTableDesc` to build a *structurally +//! valid, deeply-populated* descriptor. Non-empty columns with real +//! `SqlColumnType`s, every `SqlServerColumnDecodeType` variant (including +//! `Unsupported { context }`), `primary_key_constraint`, and populated +//! `SqlServerTableConstraint`s. It asserts the canonical +//! `from_proto(into_proto(v)) == v` Rust round-trip, which a +//! random-bytes-only target almost never reaches (random protobuf +//! decodes to near-empty messages). +//! +//! 2. **Constraint-string arm.** Drives the *raw-ingest* path +//! `SqlServerTableConstraint::try_from(SqlServerTableConstraintRaw)`, +//! which parses the `constraint_type` *string* (`"PRIMARY KEY"` / +//! `"UNIQUE"` are accepted, everything else is rejected). It feeds both +//! the two valid strings and fuzzer-controlled garbage, and proto +//! round-trips any constraint that parses. This covers the +//! string-validation boundary that the proto oneof never sees. +//! +//! 3. **Decode-type arm.** Builds a `SqlServerColumnRaw` from a real SQL +//! Server type name (`bit`, `tinyint`, `uniqueidentifier`, `xml`, +//! `datetime2`, ...) covering every supported `SqlServerColumnDecodeType` +//! plus an unsupported sentinel, runs the product `SqlServerColumnDesc::new` +//! type-mapping logic, assembles a full table desc, and proto round-trips +//! it. This reaches the `parse_data_type` mapping that the catalog format +//! is the persisted output of. +//! +//! 4. **Raw-bytes arm.** Decode arbitrary bytes and, if they happen to form a +//! valid descriptor, check the proto round-trip is stable. This guards +//! robustness against the real wire/catalog format. 
+ +#![no_main] + +use std::sync::Arc; + +use libfuzzer_sys::fuzz_target; +use mz_proto::{ProtoType, RustType}; +use mz_sql_server_util::desc::{ + SqlServerColumnRaw, SqlServerTableConstraint, SqlServerTableConstraintRaw, SqlServerTableDesc, +}; +use mz_sql_server_util::{ProtoSqlServerColumnDesc, ProtoSqlServerTableDesc}; +use proptest::strategy::{Strategy, ValueTree}; +use proptest::test_runner::{Config, RngAlgorithm, TestRng, TestRunner}; +use prost::Message; + +/// Real SQL Server data-type spellings, chosen to exercise every branch of the +/// product `parse_data_type` mapping and therefore every supported +/// `SqlServerColumnDecodeType`. The trailing entries deliberately steer into +/// the unsupported / error path. +const DATA_TYPES: &[&str] = &[ + "bit", // Bool + "tinyint", // U8 + "smallint", // I16 + "int", // I32 + "bigint", // I64 + "real", // F32 (precision <= 24) + "float", // F64 + "char", // String + "varchar", // String + "nvarchar", // String + "text", // String + "json", // String + "varbinary", // Bytes + "binary", // Bytes + "image", // Bytes + "uniqueidentifier", // Uuid + "decimal", // Numeric + "numeric", // Numeric + "money", // Numeric + "xml", // Xml + "date", // NaiveDate + "time", // NaiveTime + "datetime2", // NaiveDateTime + "datetimeoffset", // DateTime + "sql_variant", // Unsupported + "geography", // Unsupported + "totally_bogus", // Unsupported +]; + +/// Constraint-type strings: the two the product accepts, plus garbage that +/// `SqlServerTableConstraint::try_from` must reject. 
+const CONSTRAINT_TYPES: &[&str] = &[ + "PRIMARY KEY", + "UNIQUE", + "primary key", // wrong case -> rejected + "FOREIGN KEY", // unsupported -> rejected + "CHECK", // unsupported -> rejected + "", // empty -> rejected + "PRIMARY KEY ", // trailing space -> rejected + "\u{1f600}junk", // non-ascii garbage -> rejected +]; + +/// Assert that a `SqlServerTableDesc` survives a full Rust round-trip through +/// its proto representation unchanged, including a re-encode/decode of the +/// wire bytes. +fn assert_rust_roundtrip(orig: &SqlServerTableDesc) { + let proto = orig.into_proto(); + let bytes = proto.encode_to_vec(); + let proto2 = ProtoSqlServerTableDesc::decode(bytes.as_slice()) + .expect("re-encode of valid SqlServerTableDesc must decode"); + let round: SqlServerTableDesc = proto2 + .into_rust() + .expect("re-encoded SqlServerTableDesc must convert back to Rust"); + assert_eq!( + orig, &round, + "SqlServerTableDesc changed across proto roundtrip" + ); +} + +/// Decode `bytes` as a proto, and if it is a valid descriptor, assert the +/// proto round-trip is stable. Used by the raw-bytes arm. +fn check_decoded(bytes: &[u8]) { + let Ok(proto) = ProtoSqlServerTableDesc::decode(bytes) else { + return; + }; + let orig: SqlServerTableDesc = match proto.into_rust() { + Ok(v) => v, + Err(_) => return, + }; + assert_rust_roundtrip(&orig); +} + +/// Build a `SqlServerColumnRaw` from the fuzzer bytes, picking a real type name +/// so the product type-mapping logic runs end-to-end. +fn craft_column(data: &[u8], idx: usize) -> SqlServerColumnRaw { + let pick = |off: usize| data.get(off).copied().unwrap_or(idx as u8); + let data_type = DATA_TYPES[pick(0) as usize % DATA_TYPES.len()]; + // For the LOB types `text`/`ntext`/`image`, SQL Server's `sys.columns` + // invariably reports `max_length = 16` (the size of the in-row root + // pointer, not the data length), and `SqlServerColumnDesc::new` soft-asserts + // exactly that. 
That assertion is a correct developer tripwire for a real + // upstream invariant, so we must feed these types the length a live SQL + // Server would actually report rather than synthesizing one it never + // could, otherwise we trip the assertion on structurally-impossible input. + // Every other type legitimately carries a range of lengths, so keep + // fuzzing those across -1 (max), 16, and assorted small/arbitrary values. + let max_length = if matches!(data_type, "text" | "ntext" | "image") { + 16 + } else { + match pick(2) % 4 { + 0 => -1, + 1 => 16, + 2 => i16::from(pick(3)), + _ => i16::from_le_bytes([pick(3), pick(4)]), + } + }; + SqlServerColumnRaw { + name: format!("col{idx}").into(), + data_type: data_type.into(), + is_nullable: pick(1) & 1 == 0, + max_length, + precision: pick(5) % 39, + scale: pick(6) % 39, + is_computed: pick(7) & 1 == 0, + } +} + +fuzz_target!(|data: &[u8]| { + // Reserve the first byte as a mode selector and the next 32 bytes as the + // proptest seed. Everything after that feeds the raw-bytes / crafting + // logic so a single input can drive any arm. + let mode = data.first().copied().unwrap_or(0); + let mut seed = [0u8; 32]; + let seed_src = data.get(1..33).unwrap_or(&[]); + seed[..seed_src.len()].copy_from_slice(seed_src); + let rest = data.get(33..).unwrap_or(&[]); + + match mode % 4 { + 0 => { + // Valid-value arm: drive proptest's Arbitrary from the seed. + let mut runner = TestRunner::new_with_rng( + Config::default(), + TestRng::from_seed(RngAlgorithm::ChaCha, &seed), + ); + let value = match ::arbitrary() + .new_tree(&mut runner) + { + Ok(tree) => tree.current(), + Err(_) => return, + }; + assert_rust_roundtrip(&value); + } + 1 => { + // Constraint-string arm: exercise the raw-ingest string parser for + // both accepted and rejected `constraint_type` spellings. 
+ let ty_idx = rest.first().copied().unwrap_or(0) as usize % CONSTRAINT_TYPES.len(); + let n_cols = (rest.get(1).copied().unwrap_or(0) % 4) as usize; + let columns: Vec = (0..n_cols).map(|i| format!("c{i}")).collect(); + let raw = SqlServerTableConstraintRaw { + constraint_name: "fuzz_constraint".to_string(), + constraint_type: CONSTRAINT_TYPES[ty_idx].to_string(), + columns, + }; + // Garbage strings must be rejected. Valid ones must parse and then + // survive a proto round-trip inside a table desc. + let Ok(constraint) = SqlServerTableConstraint::try_from(raw) else { + return; + }; + let desc = SqlServerTableDesc { + schema_name: "dbo".into(), + name: "fuzz".into(), + columns: Box::new([]), + constraints: vec![constraint], + }; + assert_rust_roundtrip(&desc); + } + 2 => { + // Decode-type arm: run the product type-mapping over real type + // spellings and round-trip the resulting columns. + let n_cols = 1 + (rest.first().copied().unwrap_or(0) % 6) as usize; + let mut columns = Vec::with_capacity(n_cols); + for i in 0..n_cols { + // Give each column a distinct 8-byte window of the input. + let off = 1 + i * 8; + let window = rest.get(off..).unwrap_or(&[]); + let raw = craft_column(window, i); + let mut desc = mz_sql_server_util::desc::SqlServerColumnDesc::new(&raw); + // Occasionally populate the deprecated PK-constraint field so + // the `Option<Arc<str>>` round-trip is covered too. + if rest.get(off).copied().unwrap_or(0) & 0x80 != 0 { + desc.primary_key_constraint = Some(Arc::from("pk_fuzz")); + } + columns.push(desc); + } + let desc = SqlServerTableDesc { + schema_name: "dbo".into(), + name: "fuzz".into(), + columns: columns.into_boxed_slice(), + constraints: vec![], + }; + assert_rust_roundtrip(&desc); + + // Also assert the per-column proto leaf round-trips independently, + // which isolates the `decode_type` oneof + `column_type` mapping. 
+ for col in desc.columns.iter() { + let proto: ProtoSqlServerColumnDesc = col.into_proto(); + let back: mz_sql_server_util::desc::SqlServerColumnDesc = proto + .into_rust() + .expect("column desc must convert back to Rust"); + assert_eq!(col, &back, "SqlServerColumnDesc changed across roundtrip"); + } + } + _ => { + // Raw-bytes arm: decode arbitrary bytes directly. + check_decoded(rest); + } + } +}); diff --git a/src/storage-types/fuzz/.gitignore b/src/storage-types/fuzz/.gitignore new file mode 100644 index 0000000000000..ab0eaa1a49031 --- /dev/null +++ b/src/storage-types/fuzz/.gitignore @@ -0,0 +1,5 @@ +target/ +corpus/ +artifacts/ +coverage/ +Cargo.lock diff --git a/src/storage-types/fuzz/Cargo.toml b/src/storage-types/fuzz/Cargo.toml new file mode 100644 index 0000000000000..648560a48b748 --- /dev/null +++ b/src/storage-types/fuzz/Cargo.toml @@ -0,0 +1,73 @@ +# Fuzz crate for mz-storage-types proto round-trip properties. Covers the +# top-level wire types that travel between storage controller and clusterd +# (source descriptors and dataflow errors). A decoder bug here is a crash +# or corruption risk for the storage path. +# +# Excluded from the main workspace because libFuzzer requires nightly Rust. +# Run via the repo-wide runner: `bin/ci-builder run nightly ci/test/cargo-fuzz.sh`, +# or locally: +# cd src/storage-types/fuzz +# cargo +nightly fuzz run source_data_proto_roundtrip -- -max_total_time=60 + +[package] +workspace = "../../../test/cargo-fuzz" +name = "mz-storage-types-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +mz-storage-types = { path = "..", features = ["proptest"] } +mz-proto = { path = "../../proto" } +mz-repr = { path = "../../repr", features = ["proptest"] } +# The `proptest` feature on these source-descriptor crates activates the +# `Arbitrary` derives used by the structured arm of +# source_export_statement_details_proto_roundtrip. 
(postgres-util gates them +# behind its `schemas` feature, which is on by default.) +mz-mysql-util = { path = "../../mysql-util", features = ["proptest"] } +mz-postgres-util = { path = "../../postgres-util" } +mz-sql-server-util = { path = "../../sql-server-util", features = ["proptest"] } +prost = "0.14.3" +proptest = "1" + +[[bin]] +name = "csv_decode" +path = "fuzz_targets/csv_decode.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "source_data_proto_roundtrip" +path = "fuzz_targets/source_data_proto_roundtrip.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "dataflow_error_proto_roundtrip" +path = "fuzz_targets/dataflow_error_proto_roundtrip.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "postgres_publication_details_proto_roundtrip" +path = "fuzz_targets/postgres_publication_details_proto_roundtrip.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "source_export_statement_details_proto_roundtrip" +path = "fuzz_targets/source_export_statement_details_proto_roundtrip.rs" +test = false +doc = false +bench = false + +# This crate joins the `test/cargo-fuzz` workspace via `package.workspace`, so +# `[patch.crates-io]` must live in that workspace's root manifest, not here. diff --git a/src/storage-types/fuzz/fuzz_targets/csv_decode.rs b/src/storage-types/fuzz/fuzz_targets/csv_decode.rs new file mode 100644 index 0000000000000..7e5526a453bc6 --- /dev/null +++ b/src/storage-types/fuzz/fuzz_targets/csv_decode.rs @@ -0,0 +1,61 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `CsvDecoderState` decodes untrusted CSV bytes (a source object's +//! contents) into `Row`s. 
This is the `FORMAT CSV` source decoder, the first to +//! touch external data. A panic reachable from the bytes is a source-ingestion +//! availability bug. Decoding must only ever return `Ok`/`Err`. +//! +//! The first two bytes pick the decoder config (column count 1..=4, the field +//! delimiter, and whether the first row is a validated header). The rest is fed +//! as the CSV stream, draining it through `decode` exactly as the storage decode +//! operator does. This exercises the buffer-growth (`OutputFull`/`OutputEndsFull`), +//! column-count-mismatch, invalid-UTF-8, and header-validation paths. + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use mz_storage_types::sources::encoding::{ColumnSpec, CsvDecoderState, CsvEncoding}; + +fuzz_target!(|data: &[u8]| { + // First two bytes are the config, the remainder is the CSV stream. + if data.len() < 2 { + return; + } + let (cfg, mut chunk) = data.split_at(2); + let n_cols = usize::from(cfg[0] % 4) + 1; + // Usually the standard comma, but sometimes an arbitrary delimiter byte + // (which csv_core accepts) to reach unusual framing. + let delimiter = if cfg[1] & 1 == 0 { b',' } else { cfg[1] }; + let columns = if cfg[1] & 2 != 0 { + ColumnSpec::Header { + names: (0..n_cols).map(|i| format!("c{i}")).collect(), + } + } else { + ColumnSpec::Count(n_cols) + }; + + let mut state = CsvDecoderState::new(CsvEncoding { columns, delimiter }); + + // Drain the stream a record at a time, like the decode operator. Each call + // returns one decoded record (or an error for a malformed one, having + // consumed it) until the input is exhausted (`Ok(None)`). The progress guard + // is belt-and-suspenders against a non-advancing call. 
+ loop { + let before = chunk.len(); + match state.decode(&mut chunk) { + Ok(None) => break, + Ok(Some(_)) | Err(_) => { + if chunk.len() == before { + break; + } + } + } + } +}); diff --git a/src/storage-types/fuzz/fuzz_targets/dataflow_error_proto_roundtrip.rs b/src/storage-types/fuzz/fuzz_targets/dataflow_error_proto_roundtrip.rs new file mode 100644 index 0000000000000..8606af95bfcea --- /dev/null +++ b/src/storage-types/fuzz/fuzz_targets/dataflow_error_proto_roundtrip.rs @@ -0,0 +1,86 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: `DataflowError`s must survive a proto encode + decode round trip +//! losslessly. `DataflowError`s travel between the storage controller and +//! clusterd, so a decoder bug here is a crash/poisoning risk for the storage +//! path. +//! +//! Two complementary input arms (the first byte picks): +//! +//! * **Structured arm.** Drives `DataflowError`'s proptest `Arbitrary` from the +//! libFuzzer byte stream to synthesize a *valid, deeply-populated* value +//! (the 4-arm oneof, including the nested `EvalError` / `SourceError` / +//! `EnvelopeError` / `DecodeError` trees). Random proto bytes almost never +//! reach these inner variants, so this is where the interesting branches of +//! `into_proto`/`from_proto` actually get covered. We assert the full +//! `Rust -> Proto -> Rust` round trip is the identity. +//! * **Raw-bytes arm.** Decodes arbitrary bytes straight into +//! `ProtoDataflowError`, exercising the decoder against malformed/adversarial +//! wire input, then re-encodes the recovered value. 
+ +#![no_main] + +use libfuzzer_sys::fuzz_target; +use mz_proto::ProtoType; +use mz_storage_types::errors::{DataflowError, ProtoDataflowError}; +use prost::Message; +use proptest::strategy::{Strategy, ValueTree}; +use proptest::test_runner::{Config, RngAlgorithm, TestRng, TestRunner}; + +/// Build a 32-byte proptest seed from `bytes` (zero-padded / truncated). +fn seed_from(bytes: &[u8]) -> [u8; 32] { + let mut seed = [0u8; 32]; + let n = bytes.len().min(32); + seed[..n].copy_from_slice(&bytes[..n]); + seed +} + +/// `Rust -> Proto -> Rust` must be the identity for any valid `DataflowError`. +fn assert_roundtrip(orig: DataflowError) { + let proto = >::from_rust(&orig); + let bytes = proto.encode_to_vec(); + let proto2 = ProtoDataflowError::decode(bytes.as_slice()) + .expect("re-encode of valid DataflowError must decode"); + let round: DataflowError = proto2 + .into_rust() + .expect("re-encoded DataflowError must convert back to Rust"); + assert_eq!(orig, round, "DataflowError changed across proto roundtrip"); +} + +fuzz_target!(|data: &[u8]| { + let Some((&mode, rest)) = data.split_first() else { + return; + }; + + if mode & 1 == 0 { + // Structured arm: synthesize a valid, deep value via proptest Arbitrary. + let seed = seed_from(rest); + let mut runner = TestRunner::new_with_rng( + Config::default(), + TestRng::from_seed(RngAlgorithm::ChaCha, &seed), + ); + let Ok(tree) = ::arbitrary() + .new_tree(&mut runner) + else { + return; + }; + assert_roundtrip(tree.current()); + } else { + // Raw-bytes arm: decode adversarial wire bytes, then round-trip. 
+ let Ok(proto) = ProtoDataflowError::decode(rest) else { + return; + }; + let orig: DataflowError = match proto.into_rust() { + Ok(v) => v, + Err(_) => return, + }; + assert_roundtrip(orig); + } +}); diff --git a/src/storage-types/fuzz/fuzz_targets/postgres_publication_details_proto_roundtrip.rs b/src/storage-types/fuzz/fuzz_targets/postgres_publication_details_proto_roundtrip.rs new file mode 100644 index 0000000000000..5542db4fcffa9 --- /dev/null +++ b/src/storage-types/fuzz/fuzz_targets/postgres_publication_details_proto_roundtrip.rs @@ -0,0 +1,97 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: a `PostgresSourcePublicationDetails` must survive a proto +//! encode + decode round trip losslessly. These descriptors travel between the +//! storage controller and clusterd and persist replication state, so a decoder +//! bug here is reachable via the storage RPC and on-disk format. +//! +//! The Rust type is near-trivial (two `String`s, an `Option<u64>`, and an +//! `Option<bool>`), so rather than wiring up proptest we synthesize a valid value +//! directly from the input bytes. Two complementary input arms (the first byte picks): +//! +//! * **Structured arm.** Splits the remaining bytes into the `slot` and +//! `database` strings (lossy UTF-8 so arbitrary bytes always yield a valid +//! `String`), derives `timeline_id` from up to 8 input bytes, and uses bits of +//! the mode byte to choose whether each `Option` is populated, then asserts the +//! full `Rust -> Proto -> Rust` round trip is the identity. This routinely yields +//! non-empty fields and populated `Option`s, which random proto bytes rarely produce. +//! * **Raw-bytes arm.** Decodes arbitrary bytes straight into the proto, +//! 
exercising the decoder against malformed/adversarial wire input, then +//! re-encodes the recovered value. + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use mz_proto::ProtoType; +use mz_storage_types::sources::postgres::{ + PostgresSourcePublicationDetails, ProtoPostgresSourcePublicationDetails, +}; +use prost::Message; + +/// `Rust -> Proto -> Rust` must be the identity for any valid value. +fn assert_roundtrip(orig: PostgresSourcePublicationDetails) { + let proto = >::from_rust(&orig); + let bytes = proto.encode_to_vec(); + let proto2 = ProtoPostgresSourcePublicationDetails::decode(bytes.as_slice()) + .expect("re-encode of valid PostgresSourcePublicationDetails must decode"); + let round: PostgresSourcePublicationDetails = proto2 + .into_rust() + .expect("re-encoded PostgresSourcePublicationDetails must convert back to Rust"); + assert_eq!( + orig, round, + "PostgresSourcePublicationDetails changed across proto roundtrip" + ); +} + +fuzz_target!(|data: &[u8]| { + let Some((&mode, rest)) = data.split_first() else { + return; + }; + + if mode & 1 == 0 { + // Structured arm: build a valid value directly from the bytes. + let mid = rest.len() / 2; + let (slot_bytes, db_bytes) = rest.split_at(mid); + let timeline_id = if mode & 2 == 0 { + None + } else { + // Pack up to 8 bytes of the input into the u64 timeline id. + let mut buf = [0u8; 8]; + let n = rest.len().min(8); + buf[..n].copy_from_slice(&rest[..n]); + Some(u64::from_le_bytes(buf)) + }; + // `None` exercises the pre-field default; bits 2 and 3 of `mode` pick + // between an absent value and a recorded `false`/`true`. + let is_physical_replica = if mode & 4 == 0 { + None + } else { + Some(mode & 8 != 0) + }; + assert_roundtrip(PostgresSourcePublicationDetails { + slot: String::from_utf8_lossy(slot_bytes).into_owned(), + timeline_id, + database: String::from_utf8_lossy(db_bytes).into_owned(), + is_physical_replica, + }); + } else { + // Raw-bytes arm: decode adversarial wire bytes, then round-trip. 
+ let Ok(proto) = ProtoPostgresSourcePublicationDetails::decode(rest) else { + return; + }; + let orig: PostgresSourcePublicationDetails = match proto.into_rust() { + Ok(v) => v, + Err(_) => return, + }; + assert_roundtrip(orig); + } +}); diff --git a/src/storage-types/fuzz/fuzz_targets/source_data_proto_roundtrip.rs b/src/storage-types/fuzz/fuzz_targets/source_data_proto_roundtrip.rs new file mode 100644 index 0000000000000..351994c15222d --- /dev/null +++ b/src/storage-types/fuzz/fuzz_targets/source_data_proto_roundtrip.rs @@ -0,0 +1,100 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: a `SourceData` must survive a proto encode + decode round trip +//! losslessly. `SourceData` is the persisted/wire payload of every storage +//! collection, so a decoder bug here corrupts source output. +//! +//! `SourceData` wraps a `Result`. Two complementary input +//! arms (the first byte picks): +//! +//! * **Structured arm.** Drives proptest `Arbitrary` from the libFuzzer byte +//! stream to synthesize a *valid, populated* value. The `Ok` branch packs a +//! full random `Row` (every datum kind, including the tricky numeric / date / +//! array / map encodings). The `Err` branch synthesizes a deep +//! `DataflowError` (the 4-arm oneof with nested EvalError/Source/Envelope +//! trees). Random proto bytes essentially never reach a non-empty Row or a +//! deep error, so this is where `Row::into_proto`/`from_proto` and the error +//! conversions actually get exercised. We assert the full +//! `Rust -> Proto -> Rust` round trip is the identity. +//! * **Raw-bytes arm.** Decodes arbitrary bytes straight into +//! 
`ProtoSourceData`, exercising the decoder against malformed/adversarial +//! wire input, then re-encodes the recovered value. + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use mz_proto::ProtoType; +use mz_repr::Row; +use mz_storage_types::errors::DataflowError; +use mz_storage_types::sources::{ProtoSourceData, SourceData}; +use prost::Message; +use proptest::strategy::{Strategy, ValueTree}; +use proptest::test_runner::{Config, RngAlgorithm, TestRng, TestRunner}; + +/// Build a 32-byte proptest seed from `bytes` (zero-padded / truncated). +fn seed_from(bytes: &[u8]) -> [u8; 32] { + let mut seed = [0u8; 32]; + let n = bytes.len().min(32); + seed[..n].copy_from_slice(&bytes[..n]); + seed +} + +/// `Rust -> Proto -> Rust` must be the identity for any valid `SourceData`. +fn assert_roundtrip(orig: SourceData) { + let proto = >::from_rust(&orig); + let bytes = proto.encode_to_vec(); + let proto2 = ProtoSourceData::decode(bytes.as_slice()) + .expect("re-encode of valid SourceData must decode"); + let round: SourceData = proto2 + .into_rust() + .expect("re-encoded SourceData must convert back to Rust"); + assert_eq!(orig, round, "SourceData changed across proto roundtrip"); +} + +fuzz_target!(|data: &[u8]| { + let Some((&mode, rest)) = data.split_first() else { + return; + }; + + if mode & 1 == 0 { + // Structured arm: synthesize a valid value via proptest Arbitrary. The + // low bit of `mode` selected this arm, the next bit picks Ok vs Err. + let seed = seed_from(rest); + let mut runner = TestRunner::new_with_rng( + Config::default(), + TestRng::from_seed(RngAlgorithm::ChaCha, &seed), + ); + let value = if mode & 2 == 0 { + let Ok(tree) = ::arbitrary().new_tree(&mut runner) + else { + return; + }; + SourceData(Ok(tree.current())) + } else { + let Ok(tree) = + ::arbitrary().new_tree(&mut runner) + else { + return; + }; + SourceData(Err(tree.current())) + }; + assert_roundtrip(value); + } else { + // Raw-bytes arm: decode adversarial wire bytes, then round-trip. 
+ let Ok(proto) = ProtoSourceData::decode(rest) else { + return; + }; + let orig: SourceData = match proto.into_rust() { + Ok(v) => v, + Err(_) => return, + }; + assert_roundtrip(orig); + } +}); diff --git a/src/storage-types/fuzz/fuzz_targets/source_export_statement_details_proto_roundtrip.rs b/src/storage-types/fuzz/fuzz_targets/source_export_statement_details_proto_roundtrip.rs new file mode 100644 index 0000000000000..fb76f1a85b98f --- /dev/null +++ b/src/storage-types/fuzz/fuzz_targets/source_export_statement_details_proto_roundtrip.rs @@ -0,0 +1,166 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: a `SourceExportStatementDetails` must survive a proto encode + +//! decode round trip losslessly. The Rust side is a 5-variant enum +//! (Postgres / MySql / SqlServer / LoadGenerator / Kafka), so the conversion +//! has plenty of branches that need to round-trip. This value is serialized to +//! the catalog, so a decoder bug here is a corruption/migration risk. +//! +//! Two complementary input arms (the first byte picks): +//! +//! * **Structured arm.** Synthesizes a *valid, populated* value. The +//! Postgres / MySql / SqlServer variants carry a full table descriptor, which +//! we generate with each `*TableDesc`'s proptest `Arbitrary` (driven from the +//! libFuzzer byte stream). The load-generator output and the empty Kafka +//! variant are picked directly. Random proto bytes essentially never produce +//! a non-empty table descriptor, so this is where the nested column / +//! constraint / key conversions actually get exercised. +//! * **Raw-bytes arm.** Decodes arbitrary bytes straight into the proto, +//! 
exercising the decoder against malformed/adversarial wire input (including +//! the SQL Server `Lsn` `try_from` length guard, which is only reachable from +//! raw bytes since a re-encoded `Lsn` is always exactly 10 bytes). +//! +//! `SourceExportStatementDetails` doesn't derive `PartialEq`/`Debug`, so +//! losslessness is asserted by comparing the canonical re-encoded bytes from +//! two successive `Rust -> Proto` round trips. + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use mz_mysql_util::MySqlTableDesc; +use mz_postgres_util::desc::PostgresTableDesc; +use mz_proto::ProtoType; +use mz_sql_server_util::desc::SqlServerTableDesc; +use mz_storage_types::sources::load_generator::LoadGeneratorOutput; +use mz_storage_types::sources::{ProtoSourceExportStatementDetails, SourceExportStatementDetails}; +use prost::Message; +use proptest::strategy::{Strategy, ValueTree}; +use proptest::test_runner::{Config, RngAlgorithm, TestRng, TestRunner}; + +/// Build a 32-byte proptest seed from `bytes` (zero-padded / truncated). +fn seed_from(bytes: &[u8]) -> [u8; 32] { + let mut seed = [0u8; 32]; + let n = bytes.len().min(32); + seed[..n].copy_from_slice(&bytes[..n]); + seed +} + +/// Generate one value of `T` via its proptest `Arbitrary`, or `None` if the +/// strategy fails. +fn arb(runner: &mut TestRunner) -> Option { + T::arbitrary().new_tree(runner).ok().map(|t| t.current()) +} + +/// The canonical proto encoding of `details`. +fn encode(details: &SourceExportStatementDetails) -> Vec { + >::from_rust( + details, + ) + .encode_to_vec() +} + +/// `Rust -> Proto -> Rust -> Proto` must reproduce the same canonical bytes. 
+fn assert_roundtrip(orig: SourceExportStatementDetails) { + let canonical = encode(&orig); + let reparsed = ProtoSourceExportStatementDetails::decode(canonical.as_slice()) + .expect("re-encode of valid SourceExportStatementDetails must decode"); + let round: SourceExportStatementDetails = reparsed + .into_rust() + .expect("re-encoded SourceExportStatementDetails must convert back to Rust"); + assert_eq!( + canonical, + encode(&round), + "SourceExportStatementDetails changed across proto roundtrip" + ); +} + +fuzz_target!(|data: &[u8]| { + let Some((&mode, rest)) = data.split_first() else { + return; + }; + + if mode & 1 == 0 { + // Structured arm: synthesize a valid value. Upper bits of `mode` select + // which of the 5 variants to build. + let seed = seed_from(rest); + let mut runner = TestRunner::new_with_rng( + Config::default(), + TestRng::from_seed(RngAlgorithm::ChaCha, &seed), + ); + let value = match (mode >> 1) % 5 { + 0 => { + let Some(table) = arb::(&mut runner) else { + return; + }; + SourceExportStatementDetails::Postgres { table } + } + 1 => { + let Some(table) = arb::(&mut runner) else { + return; + }; + let Some(initial_gtid_set) = arb::(&mut runner) else { + return; + }; + let Some(binlog_full_metadata) = arb::(&mut runner) else { + return; + }; + SourceExportStatementDetails::MySql { + table, + initial_gtid_set, + binlog_full_metadata, + } + } + 2 => { + let Some(table) = arb::(&mut runner) else { + return; + }; + let Some(capture_instance) = arb::(&mut runner) else { + return; + }; + let Some(initial_lsn) = arb::(&mut runner) else { + return; + }; + SourceExportStatementDetails::SqlServer { + table, + capture_instance: capture_instance.into(), + initial_lsn, + } + } + 3 => { + // Cover every `LoadGeneratorOutput` discriminant. 
+ let output = match rest.first().copied().unwrap_or(0) % 4 { + 0 => LoadGeneratorOutput::Default, + 1 => LoadGeneratorOutput::Auction( + mz_storage_types::sources::load_generator::AuctionView::Bids, + ), + 2 => LoadGeneratorOutput::Marketing( + mz_storage_types::sources::load_generator::MarketingView::Leads, + ), + _ => LoadGeneratorOutput::Tpch( + mz_storage_types::sources::load_generator::TpchView::Customer, + ), + }; + SourceExportStatementDetails::LoadGenerator { output } + } + _ => SourceExportStatementDetails::Kafka {}, + }; + assert_roundtrip(value); + } else { + // Raw-bytes arm: decode adversarial wire bytes, then round-trip. + let Ok(proto) = ProtoSourceExportStatementDetails::decode(rest) else { + return; + }; + let orig: SourceExportStatementDetails = match proto.into_rust() { + Ok(v) => v, + Err(_) => return, + }; + assert_roundtrip(orig); + } +}); diff --git a/src/storage/Cargo.toml b/src/storage/Cargo.toml index cff7944e80d48..4baba1c8accae 100644 --- a/src/storage/Cargo.toml +++ b/src/storage/Cargo.toml @@ -109,6 +109,9 @@ tokio.workspace = true [features] default = [] +# Exposes the in-memory upsert backend and `FuzzUpsertParts` so the fuzz crate +# can drive the upsert state machine. Not enabled in production builds. +fuzzing = [] [package.metadata.cargo-udeps.ignore] # only used on linux diff --git a/src/storage/fuzz/.gitignore b/src/storage/fuzz/.gitignore new file mode 100644 index 0000000000000..ab0eaa1a49031 --- /dev/null +++ b/src/storage/fuzz/.gitignore @@ -0,0 +1,5 @@ +target/ +corpus/ +artifacts/ +coverage/ +Cargo.lock diff --git a/src/storage/fuzz/Cargo.toml b/src/storage/fuzz/Cargo.toml new file mode 100644 index 0000000000000..a5fb11d2f4116 --- /dev/null +++ b/src/storage/fuzz/Cargo.toml @@ -0,0 +1,53 @@ +# Fuzz crate for mz-storage: exercise the upsert state machine's value +# consolidation/encoding over untrusted source values. Excluded from the main +# workspace because libFuzzer requires nightly Rust. 
+ +[package] +workspace = "../../../test/cargo-fuzz" +name = "mz-storage-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +mz-storage = { path = "..", features = ["fuzzing"] } +mz-repr = { path = "../../repr" } +mz-storage-types = { path = "../../storage-types" } +mz-row-spine = { path = "../../row-spine" } +differential-dataflow = "0.24.0" +timely = "0.30.0" +tokio = { version = "1", features = ["rt"] } +chrono = { version = "0.4.39", default-features = false, features = ["clock", "serde", "std"] } +uuid = "1.19.0" + +[[bin]] +name = "upsert_consolidate" +path = "fuzz_targets/upsert_consolidate.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "upsert_value_roundtrip_v2" +path = "fuzz_targets/upsert_value_roundtrip_v2.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "upsert_state_consolidate" +path = "fuzz_targets/upsert_state_consolidate.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "upsert_runtime" +path = "fuzz_targets/upsert_runtime.rs" +test = false +doc = false +bench = false diff --git a/src/storage/fuzz/fuzz_targets/upsert_consolidate.rs b/src/storage/fuzz/fuzz_targets/upsert_consolidate.rs new file mode 100644 index 0000000000000..fcaf5bc142556 --- /dev/null +++ b/src/storage/fuzz/fuzz_targets/upsert_consolidate.rs @@ -0,0 +1,350 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: the upsert state machine's value consolidation +//! (`StateValue::merge_update` + `ensure_decoded`). Upsert collapses a key's +//! `(value, diff)` history into its current value using a XOR/length/checksum +//! 
accumulator (so it never has to keep the unconsolidated history in memory), +//! then `ensure_decoded` `bincode`-decodes the accumulated `value_xor` back into +//! a `Row`. The *values* come from untrusted source data, so this exercises that +//! accumulate-then-decode machinery against arbitrary `Row`s. +//! +//! The *diffs*, by contrast, are Materialize-controlled: a well-formed upsert +//! collection has, per key, zero-or-more canceling `(prev, +1) (prev, -1)` pairs +//! and at most one `(cur, +1)`. We only generate that shape (so `diff_sum` stays +//! in `{0, 1}` and we never trip the intentional "invalid upsert state" invariant +//! that guards against impossible diff sums), shuffle the updates, merge them, +//! and check the consolidated result: +//! +//! * with a current value, it must decode back to exactly that value (the +//! canceling pairs must XOR/sum away regardless of order or of colliding +//! with each other or the current value). +//! * with no current value, the key must consolidate to absent (a tombstone). +//! +//! A panic in `merge_update`/`ensure_decoded`, or a mismatch, is a finding. +//! +//! The values (`prev`s and the `cur`) are drawn from the tricky-encoding datum +//! space (`Numeric`, `Date`, `Timestamp`/`TimestampTz`, `Interval`, `Uuid`, +//! `MzTimestamp`, `Bytes`, long strings, `MzAclItem`, and the nested composites +//! `Array`/`List`/`Map`/`Range`), and a fraction are `Err(UpsertError)` values +//! (real upsert sources stage errored values alongside `Ok` rows). This stresses +//! the XOR/len/checksum accumulator over bincode payloads of widely varying +//! length, which is exactly what the accumulator's resize/zero-extend logic is +//! sensitive to. 
+ +#![no_main] + +use libfuzzer_sys::arbitrary::{self, Arbitrary, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_repr::adt::array::ArrayDimension; +use mz_repr::adt::date::Date; +use mz_repr::adt::interval::Interval; +use mz_repr::adt::mz_acl_item::{AclMode, MzAclItem}; +use mz_repr::adt::numeric::Numeric; +use mz_repr::adt::range::{Range, RangeBound, RangeInner}; +use mz_repr::adt::timestamp::CheckedTimestamp; +use mz_repr::role_id::RoleId; +use mz_repr::{Datum, Diff, GlobalId, Row, RowPacker, Timestamp}; +use mz_storage::fuzz_exports::{StateValue, UpsertValue, upsert_bincode_opts}; +use mz_storage_types::errors::{ + DecodeError, DecodeErrorKind, UpsertError, UpsertNullKeyError, UpsertValueError, +}; + +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc}; + +/// An arbitrary in-range `NaiveDateTime`, or `None` if out of the supported range. +fn gen_naive_dt(u: &mut Unstructured) -> arbitrary::Result> { + let year = u.int_in_range(1i32..=9999)?; + let ord = u.int_in_range(1u32..=365)?; + let secs = u.int_in_range(0u32..=86_399)?; + let nanos = u.int_in_range(0u32..=999_999_999)?; + let date = NaiveDate::from_yo_opt(year, ord); + let time = NaiveTime::from_num_seconds_from_midnight_opt(secs, nanos); + Ok(match (date, time) { + (Some(d), Some(t)) => Some(NaiveDateTime::new(d, t)), + _ => None, + }) +} + +fn gen_role_id(u: &mut Unstructured) -> arbitrary::Result { + Ok(match u.int_in_range(0u8..=2)? { + 0 => RoleId::User(u64::arbitrary(u)?), + 1 => RoleId::System(u64::arbitrary(u)?), + _ => RoleId::Public, + }) +} + +/// Push a single arbitrary scalar (non-composite) datum from the tricky space. +fn push_scalar(packer: &mut RowPacker, u: &mut Unstructured) -> arbitrary::Result<()> { + match u.int_in_range(0u8..=14)? 
{ + 0 => packer.push(Datum::Null), + 1 => packer.push(Datum::Int32(i32::arbitrary(u)?)), + 2 => packer.push(Datum::Int64(i64::arbitrary(u)?)), + 3 => packer.push(Datum::UInt8(u8::arbitrary(u)?)), + 4 => packer.push(if bool::arbitrary(u)? { + Datum::True + } else { + Datum::False + }), + 5 => packer.push(Datum::from(Numeric::from(i64::arbitrary(u)?))), + 6 => { + let days = u.int_in_range(Date::LOW_DAYS..=Date::HIGH_DAYS)?; + packer.push(Datum::Date(Date::from_pg_epoch(days).unwrap())); + } + 7 => { + if let Some(dt) = gen_naive_dt(u)? { + packer.push(Datum::Timestamp( + CheckedTimestamp::from_timestamplike(dt).unwrap(), + )); + } else { + packer.push(Datum::Null); + } + } + 8 => { + if let Some(dt) = gen_naive_dt(u)? { + let utc = DateTime::::from_naive_utc_and_offset(dt, Utc); + packer.push(Datum::TimestampTz( + CheckedTimestamp::from_timestamplike(utc).unwrap(), + )); + } else { + packer.push(Datum::Null); + } + } + 9 => packer.push(Datum::Interval(Interval::new( + i32::arbitrary(u)?, + i32::arbitrary(u)?, + i64::arbitrary(u)?, + ))), + 10 => packer.push(Datum::Uuid(uuid::Uuid::from_bytes( + <[u8; 16]>::arbitrary(u)?, + ))), + 11 => packer.push(Datum::MzTimestamp(Timestamp::from(u64::arbitrary(u)?))), + 12 => { + let len = u.int_in_range(0usize..=20)?; + let bytes = u.bytes(len)?; + packer.push(Datum::Bytes(bytes)); + } + 13 => packer.push(Datum::MzAclItem(MzAclItem { + grantee: gen_role_id(u)?, + grantor: gen_role_id(u)?, + acl_mode: AclMode::from_bits_truncate(u64::arbitrary(u)?), + })), + _ => { + // String length that may spill past the 1-byte tiny encoding. + let len = u.int_in_range(0usize..=300)?; + let mut s = String::with_capacity(len); + for _ in 0..len { + s.push(if bool::arbitrary(u)? { 'a' } else { 'é' }); + } + packer.push(Datum::String(&s)); + } + } + Ok(()) +} + +/// A vector of `'static`-safe scalar datums for use as composite elements. 
+fn gen_scalar_vec(u: &mut Unstructured, n: usize) -> arbitrary::Result>> { + let mut out = Vec::with_capacity(n); + for _ in 0..n { + out.push(match u.int_in_range(0u8..=7)? { + 0 => Datum::Null, + 1 => Datum::Int32(i32::arbitrary(u)?), + 2 => Datum::Int64(i64::arbitrary(u)?), + 3 => Datum::UInt8(u8::arbitrary(u)?), + 4 => Datum::from(Numeric::from(i64::arbitrary(u)?)), + 5 => Datum::Interval(Interval::new( + i32::arbitrary(u)?, + i32::arbitrary(u)?, + i64::arbitrary(u)?, + )), + 6 => Datum::Uuid(uuid::Uuid::from_bytes(<[u8; 16]>::arbitrary(u)?)), + _ => Datum::MzTimestamp(Timestamp::from(u64::arbitrary(u)?)), + }); + } + Ok(out) +} + +/// Push a `Range` over `Int32` bounds (possibly empty or with infinite bounds). +fn push_range(packer: &mut RowPacker, u: &mut Unstructured) -> arbitrary::Result<()> { + if bool::arbitrary(u)? { + packer + .push_range(Range { inner: None }) + .expect("empty range is valid"); + return Ok(()); + } + let lo = i32::arbitrary(u)?; + let hi = i32::arbitrary(u)?; + let (lo, hi) = if lo <= hi { (lo, hi) } else { (hi, lo) }; + let lower = RangeBound { + inclusive: bool::arbitrary(u)?, + bound: if bool::arbitrary(u)? { + Some(Datum::Int32(lo)) + } else { + None + }, + }; + let upper = RangeBound { + inclusive: bool::arbitrary(u)?, + bound: if bool::arbitrary(u)? { + Some(Datum::Int32(hi)) + } else { + None + }, + }; + let range = Range { + inner: Some(RangeInner { lower, upper }), + }; + if packer.push_range(range).is_err() { + packer.push(Datum::Int32(lo)); + } + Ok(()) +} + +/// Push one datum: usually a scalar, occasionally a nested composite whose +/// elements are themselves scalars. Always produces a valid `Row`. +fn push_datum(packer: &mut RowPacker, u: &mut Unstructured) -> arbitrary::Result<()> { + match u.int_in_range(0u8..=9)? 
{ + 0..=5 => push_scalar(packer, u), + 6 => { + let n = u.int_in_range(0usize..=4)?; + let datums = gen_scalar_vec(u, n)?; + let dims = [ArrayDimension { + lower_bound: 1, + length: datums.len(), + }]; + packer + .try_push_array(&dims, datums.iter()) + .expect("single-dimension array is always valid"); + Ok(()) + } + 7 => { + let n = u.int_in_range(0usize..=4)?; + let datums = gen_scalar_vec(u, n)?; + packer.push_list(datums.iter()); + Ok(()) + } + 8 => { + let n = u.int_in_range(0usize..=4)?; + let mut keys: Vec = Vec::with_capacity(n); + for _ in 0..n { + let len = u.int_in_range(0usize..=4)?; + let mut s = String::with_capacity(len); + for _ in 0..len { + s.push(if bool::arbitrary(u)? { 'a' } else { 'b' }); + } + keys.push(s); + } + keys.sort(); + keys.dedup(); + let vals = gen_scalar_vec(u, keys.len())?; + packer.push_dict(keys.iter().map(String::as_str).zip(vals.iter().copied())); + Ok(()) + } + _ => push_range(packer, u), + } +} + +/// A small arbitrary `Row` (the untrusted source value), covering the tricky +/// scalar and nested-composite datum space. +fn gen_row(u: &mut Unstructured) -> arbitrary::Result { + let n = u.int_in_range(0usize..=5)?; + let mut row = Row::default(); + let mut packer = row.packer(); + for _ in 0..n { + push_datum(&mut packer, u)?; + } + drop(packer); + Ok(row) +} + +/// An arbitrary `DecodeError` payload for the error-arm variants. +fn gen_decode_error(u: &mut Unstructured) -> arbitrary::Result { + let msg = String::arbitrary(u)?; + let kind = if bool::arbitrary(u)? { + DecodeErrorKind::Text(msg.into_boxed_str()) + } else { + DecodeErrorKind::Bytes(msg.into_boxed_str()) + }; + let raw = Vec::::arbitrary(u)?; + Ok(DecodeError { kind, raw }) +} + +/// An arbitrary `UpsertValue`: usually an `Ok(Row)`, occasionally an +/// `Err(UpsertError)` spanning the whole variant space (real sources stage +/// errored values alongside good rows). +fn gen_value(u: &mut Unstructured) -> arbitrary::Result { + if u.ratio(1u8, 5u8)? 
{ + let err = match u.int_in_range(0u8..=2)? { + 0 => UpsertError::KeyDecode(gen_decode_error(u)?), + 1 => UpsertError::Value(UpsertValueError { + inner: gen_decode_error(u)?, + for_key: gen_row(u)?, + }), + _ => UpsertError::NullKey(UpsertNullKeyError), + }; + Ok(Err(Box::new(err))) + } else { + Ok(Ok(gen_row(u)?)) + } +} + +fn run(u: &mut Unstructured) -> arbitrary::Result<()> { + let bincode_opts = upsert_bincode_opts(); + let mut bincode_buffer = Vec::new(); + + // Build a well-formed history: canceling `(prev, +1) (prev, -1)` pairs plus + // at most one current `(cur, +1)`. + let mut updates: Vec<(UpsertValue, Diff)> = Vec::new(); + let n_pairs = u.int_in_range(0usize..=6)?; + for _ in 0..n_pairs { + let prev: UpsertValue = gen_value(u)?; + updates.push((prev.clone(), Diff::ONE)); + updates.push((prev, -Diff::ONE)); + } + let current: Option = if bool::arbitrary(u)? { + Some(gen_value(u)?) + } else { + None + }; + if let Some(cur) = ¤t { + updates.push((cur.clone(), Diff::ONE)); + } + + // Shuffle (Fisher-Yates). Consolidation must be order-independent. 
+ for i in (1..updates.len()).rev() { + let j = u.int_in_range(0..=i)?; + updates.swap(i, j); + } + + let mut state = StateValue::<(), ()>::default(); + for (value, diff) in updates { + state.merge_update(value, diff, bincode_opts, &mut bincode_buffer); + } + state.ensure_decoded(bincode_opts, GlobalId::User(0), None); + let decoded = state.into_decoded(); + + match (current, decoded.finalized) { + (Some(cur), Some(got)) => { + assert_eq!(got, cur, "consolidation decoded the wrong current value") + } + (Some(_), None) => { + panic!("consolidation dropped the current value") + } + (None, None) => {} + (None, other) => { + panic!("consolidation invented a value from canceling pairs: {other:?}") + } + } + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let mut u = Unstructured::new(data); + let _ = run(&mut u); +}); diff --git a/src/storage/fuzz/fuzz_targets/upsert_runtime.rs b/src/storage/fuzz/fuzz_targets/upsert_runtime.rs new file mode 100644 index 0000000000000..0f6d2259f1e25 --- /dev/null +++ b/src/storage/fuzz/fuzz_targets/upsert_runtime.rs @@ -0,0 +1,447 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: the upsert operator's *runtime* processing, `drain_staged_input`. +//! That is the per-timestamp `multi_get` -> order-keyed per-command processing -> +//! `multi_put` loop that turns a batch of staged `(key, value, order)` commands +//! at various timestamps into the differential retract/insert output, with +//! last-write-wins resolved by an order key (`from_time`). +//! +//! Where `upsert_consolidate` / `upsert_state_consolidate` only exercise the +//! snapshot-rehydration *consolidation* path, this drives the live runtime loop, +//! 
which is where the hard-to-reproduce correctness issues live: multiple +//! commands for the same `(timestamp, key)` (only the maximum-order one may +//! survive), deletes, re-inserts of the same value, and a key changing across +//! several timestamps in one drain. +//! +//! Because a single `ToUpper` drain always writes *finalized* values (so a +//! later timestamp's `provisional_order` is `None` and the order-skip never +//! fires across timestamps), the semantics reduce to something a simple, +//! obviously-correct oracle can predict: per `(ts, key)` the maximum-order +//! command wins, and processing those winners in timestamp order per key yields +//! the output (retract the prior value, insert the new one) and the final state. +//! We run the real `drain_staged_input` and assert both its emitted output +//! (consolidated) and its final per-key state match that oracle. A divergence, +//! a dropped/duplicated retraction, a wrong winner, a lost or invented value, +//! is a correctness bug. A panic is an availability bug. +//! +//! To make the interesting cancellation paths fire frequently, each key draws +//! its values from a *tiny per-key pool* (plus deletes): with only a handful of +//! distinct values, re-inserting the same value, and delete-then-reinsert of an +//! identical value, are common, exactly the retract/insert exact-cancellation +//! cases. The values themselves cover the tricky-encoding datum space +//! (`Numeric`, `Date`, `Timestamp`/`TimestampTz`, `Interval`, `Uuid`, +//! `MzTimestamp`, `Bytes`, long strings, `MzAclItem`, nested +//! `Array`/`List`/`Map`/`Range`), and a fraction of pool entries are +//! `Err(UpsertError)` values (real sources stage errored values), which the +//! oracle and output comparison handle directly as `UpsertValue`s. 
+
+#![no_main]
+
+use std::collections::BTreeMap;
+use std::sync::OnceLock;
+
+use libfuzzer_sys::arbitrary::{self, Arbitrary, Unstructured};
+use libfuzzer_sys::fuzz_target;
+use mz_repr::adt::array::ArrayDimension;
+use mz_repr::adt::date::Date;
+use mz_repr::adt::interval::Interval;
+use mz_repr::adt::mz_acl_item::{AclMode, MzAclItem};
+use mz_repr::adt::numeric::Numeric;
+use mz_repr::adt::range::{Range, RangeBound, RangeInner};
+use mz_repr::adt::timestamp::CheckedTimestamp;
+use mz_repr::role_id::RoleId;
+use mz_repr::{Datum, Diff, Row, RowPacker, Timestamp};
+use mz_storage::fuzz_exports::{FuzzUpsertParts, UpsertKey, UpsertValue, fuzz_drain_staged_input};
+use mz_storage::source::SourceExportCreationConfig;
+use mz_storage_types::errors::{
+    DecodeError, DecodeErrorKind, UpsertError, UpsertNullKeyError, UpsertValueError,
+};
+
+use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc};
+
+// The metrics plumbing is built once per process. `FuzzUpsertParts` is `Sync`
+// so it lives in a `static`. `SourceExportCreationConfig` holds a non-`Sync`
+// `SourceStatistics`, so it goes in a `thread_local` (built once, not per
+// iteration, because re-registering its Prometheus metrics each call was the
+// throughput bottleneck).
+static PARTS: OnceLock<FuzzUpsertParts> = OnceLock::new();
+thread_local! {
+    static CFG: SourceExportCreationConfig = PARTS.get_or_init(FuzzUpsertParts::new).source_config();
+}
+
+fn rt() -> &'static tokio::runtime::Runtime {
+    static RT: OnceLock<tokio::runtime::Runtime> = OnceLock::new(); // built on first use, then shared
+    RT.get_or_init(|| {
+        tokio::runtime::Builder::new_current_thread()
+            .build()
+            .expect("current-thread runtime")
+    })
+}
+
+/// An arbitrary in-range `NaiveDateTime`, or `None` if out of the supported range.
+fn gen_naive_dt(u: &mut Unstructured) -> arbitrary::Result> { + let year = u.int_in_range(1i32..=9999)?; + let ord = u.int_in_range(1u32..=365)?; + let secs = u.int_in_range(0u32..=86_399)?; + let nanos = u.int_in_range(0u32..=999_999_999)?; + let date = NaiveDate::from_yo_opt(year, ord); + let time = NaiveTime::from_num_seconds_from_midnight_opt(secs, nanos); + Ok(match (date, time) { + (Some(d), Some(t)) => Some(NaiveDateTime::new(d, t)), + _ => None, + }) +} + +fn gen_role_id(u: &mut Unstructured) -> arbitrary::Result { + Ok(match u.int_in_range(0u8..=2)? { + 0 => RoleId::User(u64::arbitrary(u)?), + 1 => RoleId::System(u64::arbitrary(u)?), + _ => RoleId::Public, + }) +} + +/// Push a single arbitrary scalar (non-composite) datum from the tricky space. +fn push_scalar(packer: &mut RowPacker, u: &mut Unstructured) -> arbitrary::Result<()> { + match u.int_in_range(0u8..=14)? { + 0 => packer.push(Datum::Null), + 1 => packer.push(Datum::Int32(i32::arbitrary(u)?)), + 2 => packer.push(Datum::Int64(i64::arbitrary(u)?)), + 3 => packer.push(Datum::UInt8(u8::arbitrary(u)?)), + 4 => packer.push(if bool::arbitrary(u)? { + Datum::True + } else { + Datum::False + }), + 5 => packer.push(Datum::from(Numeric::from(i64::arbitrary(u)?))), + 6 => { + let days = u.int_in_range(Date::LOW_DAYS..=Date::HIGH_DAYS)?; + packer.push(Datum::Date(Date::from_pg_epoch(days).unwrap())); + } + 7 => { + if let Some(dt) = gen_naive_dt(u)? { + packer.push(Datum::Timestamp( + CheckedTimestamp::from_timestamplike(dt).unwrap(), + )); + } else { + packer.push(Datum::Null); + } + } + 8 => { + if let Some(dt) = gen_naive_dt(u)? 
{ + let utc = DateTime::::from_naive_utc_and_offset(dt, Utc); + packer.push(Datum::TimestampTz( + CheckedTimestamp::from_timestamplike(utc).unwrap(), + )); + } else { + packer.push(Datum::Null); + } + } + 9 => packer.push(Datum::Interval(Interval::new( + i32::arbitrary(u)?, + i32::arbitrary(u)?, + i64::arbitrary(u)?, + ))), + 10 => packer.push(Datum::Uuid(uuid::Uuid::from_bytes( + <[u8; 16]>::arbitrary(u)?, + ))), + 11 => packer.push(Datum::MzTimestamp(Timestamp::from(u64::arbitrary(u)?))), + 12 => { + let len = u.int_in_range(0usize..=20)?; + let bytes = u.bytes(len)?; + packer.push(Datum::Bytes(bytes)); + } + 13 => packer.push(Datum::MzAclItem(MzAclItem { + grantee: gen_role_id(u)?, + grantor: gen_role_id(u)?, + acl_mode: AclMode::from_bits_truncate(u64::arbitrary(u)?), + })), + _ => { + let len = u.int_in_range(0usize..=300)?; + let mut s = String::with_capacity(len); + for _ in 0..len { + s.push(if bool::arbitrary(u)? { 'a' } else { 'é' }); + } + packer.push(Datum::String(&s)); + } + } + Ok(()) +} + +/// A vector of `'static`-safe scalar datums for use as composite elements. +fn gen_scalar_vec(u: &mut Unstructured, n: usize) -> arbitrary::Result>> { + let mut out = Vec::with_capacity(n); + for _ in 0..n { + out.push(match u.int_in_range(0u8..=7)? { + 0 => Datum::Null, + 1 => Datum::Int32(i32::arbitrary(u)?), + 2 => Datum::Int64(i64::arbitrary(u)?), + 3 => Datum::UInt8(u8::arbitrary(u)?), + 4 => Datum::from(Numeric::from(i64::arbitrary(u)?)), + 5 => Datum::Interval(Interval::new( + i32::arbitrary(u)?, + i32::arbitrary(u)?, + i64::arbitrary(u)?, + )), + 6 => Datum::Uuid(uuid::Uuid::from_bytes(<[u8; 16]>::arbitrary(u)?)), + _ => Datum::MzTimestamp(Timestamp::from(u64::arbitrary(u)?)), + }); + } + Ok(out) +} + +/// Push a `Range` over `Int32` bounds (possibly empty or with infinite bounds). +fn push_range(packer: &mut RowPacker, u: &mut Unstructured) -> arbitrary::Result<()> { + if bool::arbitrary(u)? 
{ + packer + .push_range(Range { inner: None }) + .expect("empty range is valid"); + return Ok(()); + } + let lo = i32::arbitrary(u)?; + let hi = i32::arbitrary(u)?; + let (lo, hi) = if lo <= hi { (lo, hi) } else { (hi, lo) }; + let lower = RangeBound { + inclusive: bool::arbitrary(u)?, + bound: if bool::arbitrary(u)? { + Some(Datum::Int32(lo)) + } else { + None + }, + }; + let upper = RangeBound { + inclusive: bool::arbitrary(u)?, + bound: if bool::arbitrary(u)? { + Some(Datum::Int32(hi)) + } else { + None + }, + }; + let range = Range { + inner: Some(RangeInner { lower, upper }), + }; + if packer.push_range(range).is_err() { + packer.push(Datum::Int32(lo)); + } + Ok(()) +} + +/// Push one datum: usually a scalar, occasionally a nested composite whose +/// elements are themselves scalars. Always produces a valid `Row`. +fn push_datum(packer: &mut RowPacker, u: &mut Unstructured) -> arbitrary::Result<()> { + match u.int_in_range(0u8..=9)? { + 0..=5 => push_scalar(packer, u), + 6 => { + let n = u.int_in_range(0usize..=4)?; + let datums = gen_scalar_vec(u, n)?; + let dims = [ArrayDimension { + lower_bound: 1, + length: datums.len(), + }]; + packer + .try_push_array(&dims, datums.iter()) + .expect("single-dimension array is always valid"); + Ok(()) + } + 7 => { + let n = u.int_in_range(0usize..=4)?; + let datums = gen_scalar_vec(u, n)?; + packer.push_list(datums.iter()); + Ok(()) + } + 8 => { + let n = u.int_in_range(0usize..=4)?; + let mut keys: Vec = Vec::with_capacity(n); + for _ in 0..n { + let len = u.int_in_range(0usize..=4)?; + let mut s = String::with_capacity(len); + for _ in 0..len { + s.push(if bool::arbitrary(u)? 
{ 'a' } else { 'b' }); + } + keys.push(s); + } + keys.sort(); + keys.dedup(); + let vals = gen_scalar_vec(u, keys.len())?; + packer.push_dict(keys.iter().map(String::as_str).zip(vals.iter().copied())); + Ok(()) + } + _ => push_range(packer, u), + } +} + +/// A small arbitrary `Row` (the untrusted source value), covering the tricky +/// scalar and nested-composite datum space. +fn gen_row(u: &mut Unstructured) -> arbitrary::Result { + let n = u.int_in_range(0usize..=4)?; + let mut row = Row::default(); + let mut packer = row.packer(); + for _ in 0..n { + push_datum(&mut packer, u)?; + } + drop(packer); + Ok(row) +} + +/// An arbitrary `DecodeError` payload for the error-arm variants. +fn gen_decode_error(u: &mut Unstructured) -> arbitrary::Result { + let msg = String::arbitrary(u)?; + let kind = if bool::arbitrary(u)? { + DecodeErrorKind::Text(msg.into_boxed_str()) + } else { + DecodeErrorKind::Bytes(msg.into_boxed_str()) + }; + let raw = Vec::::arbitrary(u)?; + Ok(DecodeError { kind, raw }) +} + +/// An arbitrary `UpsertValue`: usually an `Ok(Row)`, occasionally an +/// `Err(UpsertError)` spanning the whole variant space. +fn gen_value(u: &mut Unstructured) -> arbitrary::Result { + if u.ratio(1u8, 5u8)? { + let err = match u.int_in_range(0u8..=2)? { + 0 => UpsertError::KeyDecode(gen_decode_error(u)?), + 1 => UpsertError::Value(UpsertValueError { + inner: gen_decode_error(u)?, + for_key: gen_row(u)?, + }), + _ => UpsertError::NullKey(UpsertNullKeyError), + }; + Ok(Err(Box::new(err))) + } else { + Ok(Ok(gen_row(u)?)) + } +} + +/// Consolidate a differential output into a `(value, ts) -> net diff` map, +/// dropping the entries that cancel out. Keys on the whole `UpsertValue` so both +/// `Ok` rows and `Err` values are compared faithfully. 
+fn consolidate(updates: &[(UpsertValue, u64, Diff)]) -> BTreeMap<(UpsertValue, u64), Diff> { + let mut m: BTreeMap<(UpsertValue, u64), Diff> = BTreeMap::new(); + for (value, ts, diff) in updates { + *m.entry((value.clone(), *ts)).or_insert(Diff::ZERO) += *diff; + } + m.retain(|_, d| *d != Diff::ZERO); + m +} + +fn run(u: &mut Unstructured) -> arbitrary::Result<()> { + let parts = PARTS.get_or_init(FuzzUpsertParts::new); + + let n_keys = u.int_in_range(1usize..=4)?; + let n_cmds = u.int_in_range(0usize..=20)?; + + // Build a tiny per-key value pool. Commands draw their value from this small + // pool (or are deletes), so re-inserting the same value and + // delete-then-reinsert of an *identical* value are common, exactly the + // retract/insert exact-cancellation cases that are easy to miss with fresh + // random values every time. + let mut pools: Vec> = Vec::with_capacity(n_keys); + for _ in 0..n_keys { + let pool_size = u.int_in_range(1usize..=3)?; + let mut pool = Vec::with_capacity(pool_size); + for _ in 0..pool_size { + pool.push(gen_value(u)?); + } + pools.push(pool); + } + + // Generate commands `(ts, key_idx, order, value)`. Orders are distinct (the + // generation index), so the max-order-per-(ts,key) winner is unambiguous, + // and multiple commands can target the same (ts, key) to exercise dedup. + let mut commands: Vec<(u64, usize, u64, Option)> = Vec::with_capacity(n_cmds); + for i in 0..n_cmds { + let ts = u.int_in_range(0u64..=4)?; + let key_idx = u.int_in_range(0usize..=n_keys - 1)?; + let value = if u.int_in_range(0u8..=3)? 
== 0 { + None // a delete + } else { + let pool = &pools[key_idx]; + let pick = u.int_in_range(0usize..=pool.len() - 1)?; + Some(pool[pick].clone()) + }; + commands.push((ts, key_idx, i as u64, value)); + } + + let keys: Vec = (0..n_keys) + .map(|k| { + let key_row = Row::pack_slice(&[Datum::Int64(i64::try_from(k).unwrap())]); + UpsertKey::from_key(Ok(&key_row)) + }) + .collect(); + + let drain_to = commands.iter().map(|(ts, ..)| *ts).max().map_or(0, |m| m + 1); + + let hook_commands: Vec<(u64, UpsertKey, u64, Option)> = commands + .iter() + .map(|(ts, k, o, v)| (*ts, keys[*k], *o, v.clone())) + .collect(); + + let (output, final_state) = CFG.with(|cfg| { + rt().block_on(fuzz_drain_staged_input( + parts, + cfg, + hook_commands, + drain_to, + &keys, + )) + }); + + // --- Oracle --------------------------------------------------------------- + // Per (key, ts), the maximum-order command wins. + let mut winner: BTreeMap<(usize, u64), (u64, Option)> = BTreeMap::new(); + for (ts, k, o, v) in &commands { + let replace = match winner.get(&(*k, *ts)) { + Some((existing_order, _)) => existing_order < o, + None => true, + }; + if replace { + winner.insert((*k, *ts), (*o, v.clone())); + } + } + // Process winners per key in timestamp order (the BTreeMap iterates + // key-major, ts-ascending): retract the prior value, insert the new one. 
+ let mut oracle_output: Vec<(UpsertValue, u64, Diff)> = Vec::new(); + let mut oracle_final: Vec> = vec![None; n_keys]; + let mut cur_key: Option = None; + let mut prev: Option = None; + for ((k, ts), (_order, value)) in &winner { + if cur_key != Some(*k) { + cur_key = Some(*k); + prev = None; // fresh state: this key has no prior value + } + if let Some(p) = &prev { + oracle_output.push((p.clone(), *ts, Diff::MINUS_ONE)); + } + if let Some(nv) = value { + oracle_output.push((nv.clone(), *ts, Diff::ONE)); + } + prev = value.clone(); + oracle_final[*k] = value.clone(); + } + + // --- Checks --------------------------------------------------------------- + assert_eq!( + consolidate(&output), + consolidate(&oracle_output), + "drain_staged_input output diverged from the upsert oracle" + ); + for (i, expected) in oracle_final.iter().enumerate() { + assert_eq!( + &final_state[i], expected, + "final state for key {i} diverged from the upsert oracle" + ); + } + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let mut u = Unstructured::new(data); + let _ = run(&mut u); +}); diff --git a/src/storage/fuzz/fuzz_targets/upsert_state_consolidate.rs b/src/storage/fuzz/fuzz_targets/upsert_state_consolidate.rs new file mode 100644 index 0000000000000..0878367694fa4 --- /dev/null +++ b/src/storage/fuzz/fuzz_targets/upsert_state_consolidate.rs @@ -0,0 +1,379 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: the upsert state machine's snapshot consolidation +//! (`UpsertState::consolidate_chunk` + `multi_get`) over the in-memory backend. +//! +//! Where `upsert_consolidate` exercises the XOR/checksum accumulator in +//! 
isolation for a single key, this drives the whole state machine: a chunk of +//! differential `(key, value, diff)` updates spanning several keys is fed to +//! `consolidate_chunk` (which re-indexes them through `StateValue` and the +//! backend), the snapshot is completed, and `multi_get` reads each key back. +//! +//! The diffs are Materialize-controlled, so per key we generate a *well-formed* +//! history, zero-or-more canceling `(prev, +1) (prev, -1)` pairs plus at most +//! one current `(cur, +1)`, and shuffle the whole multi-key chunk. After +//! consolidation each key must read back exactly its current value (or absent +//! if it had none): the canceling pairs must XOR/sum away across the chunk +//! regardless of order or of colliding with each other. A panic in +//! `consolidate_chunk`/`multi_get`, or a wrong/lost/invented value, is a finding. +//! +//! The values are drawn from the tricky-encoding datum space (`Numeric`, +//! `Date`, `Timestamp`/`TimestampTz`, `Interval`, `Uuid`, `MzTimestamp`, +//! `Bytes`, long strings, `MzAclItem`, and the nested composites +//! `Array`/`List`/`Map`/`Range`), and a fraction are `Err(UpsertError)` values +//! (real upsert sources stage errored values alongside `Ok` rows), so the +//! state-machine consolidation path is exercised over bincode payloads of widely +//! varying length and over both value arms. 
+
+#![no_main]
+
+use std::collections::HashMap;
+use std::sync::OnceLock;
+
+use libfuzzer_sys::arbitrary::{self, Arbitrary, Unstructured};
+use libfuzzer_sys::fuzz_target;
+use mz_repr::adt::array::ArrayDimension;
+use mz_repr::adt::date::Date;
+use mz_repr::adt::interval::Interval;
+use mz_repr::adt::mz_acl_item::{AclMode, MzAclItem};
+use mz_repr::adt::numeric::Numeric;
+use mz_repr::adt::range::{Range, RangeBound, RangeInner};
+use mz_repr::adt::timestamp::CheckedTimestamp;
+use mz_repr::role_id::RoleId;
+use mz_repr::{Datum, Diff, GlobalId, Row, RowPacker, Timestamp};
+use mz_storage::fuzz_exports::{
+    FuzzUpsertParts, UpsertKey, UpsertValue, UpsertValueAndSize, upsert_bincode_opts,
+};
+use mz_storage_types::errors::{
+    DecodeError, DecodeErrorKind, UpsertError, UpsertNullKeyError, UpsertValueError,
+};
+
+use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc};
+
+fn rt() -> &'static tokio::runtime::Runtime {
+    static RT: OnceLock<tokio::runtime::Runtime> = OnceLock::new(); // built on first use, then shared
+    RT.get_or_init(|| {
+        tokio::runtime::Builder::new_current_thread()
+            .build()
+            .expect("current-thread runtime")
+    })
+}
+
+/// An arbitrary in-range `NaiveDateTime`, or `None` if out of the supported range.
+fn gen_naive_dt(u: &mut Unstructured) -> arbitrary::Result<Option<NaiveDateTime>> {
+    let year = u.int_in_range(1i32..=9999)?;
+    let ord = u.int_in_range(1u32..=365)?; // day-of-year; 366 is never drawn
+    let secs = u.int_in_range(0u32..=86_399)?;
+    let nanos = u.int_in_range(0u32..=999_999_999)?;
+    let date = NaiveDate::from_yo_opt(year, ord);
+    let time = NaiveTime::from_num_seconds_from_midnight_opt(secs, nanos);
+    Ok(match (date, time) {
+        (Some(d), Some(t)) => Some(NaiveDateTime::new(d, t)),
+        _ => None,
+    })
+}
+
+fn gen_role_id(u: &mut Unstructured) -> arbitrary::Result<RoleId> {
+    Ok(match u.int_in_range(0u8..=2)? {
+        0 => RoleId::User(u64::arbitrary(u)?),
+        1 => RoleId::System(u64::arbitrary(u)?),
+        _ => RoleId::Public,
+    })
+}
+
+/// Push a single arbitrary scalar (non-composite) datum from the tricky space.
+fn push_scalar(packer: &mut RowPacker, u: &mut Unstructured) -> arbitrary::Result<()> {
+    match u.int_in_range(0u8..=14)? {
+        0 => packer.push(Datum::Null),
+        1 => packer.push(Datum::Int32(i32::arbitrary(u)?)),
+        2 => packer.push(Datum::Int64(i64::arbitrary(u)?)),
+        3 => packer.push(Datum::UInt8(u8::arbitrary(u)?)),
+        4 => packer.push(if bool::arbitrary(u)? {
+            Datum::True
+        } else {
+            Datum::False
+        }),
+        5 => packer.push(Datum::from(Numeric::from(i64::arbitrary(u)?))),
+        6 => {
+            let days = u.int_in_range(Date::LOW_DAYS..=Date::HIGH_DAYS)?;
+            packer.push(Datum::Date(Date::from_pg_epoch(days).unwrap()));
+        }
+        7 => {
+            if let Some(dt) = gen_naive_dt(u)? {
+                packer.push(Datum::Timestamp(
+                    CheckedTimestamp::from_timestamplike(dt).unwrap(),
+                ));
+            } else {
+                packer.push(Datum::Null);
+            }
+        }
+        8 => {
+            if let Some(dt) = gen_naive_dt(u)? {
+                let utc = DateTime::<Utc>::from_naive_utc_and_offset(dt, Utc);
+                packer.push(Datum::TimestampTz(
+                    CheckedTimestamp::from_timestamplike(utc).unwrap(),
+                ));
+            } else {
+                packer.push(Datum::Null);
+            }
+        }
+        9 => packer.push(Datum::Interval(Interval::new(
+            i32::arbitrary(u)?,
+            i32::arbitrary(u)?,
+            i64::arbitrary(u)?,
+        ))),
+        10 => packer.push(Datum::Uuid(uuid::Uuid::from_bytes(
+            <[u8; 16]>::arbitrary(u)?,
+        ))),
+        11 => packer.push(Datum::MzTimestamp(Timestamp::from(u64::arbitrary(u)?))),
+        12 => {
+            let len = u.int_in_range(0usize..=20)?;
+            let bytes = u.bytes(len)?;
+            packer.push(Datum::Bytes(bytes));
+        }
+        13 => packer.push(Datum::MzAclItem(MzAclItem {
+            grantee: gen_role_id(u)?,
+            grantor: gen_role_id(u)?,
+            acl_mode: AclMode::from_bits_truncate(u64::arbitrary(u)?),
+        })),
+        _ => {
+            let len = u.int_in_range(0usize..=300)?;
+            let mut s = String::with_capacity(len);
+            for _ in 0..len {
+                s.push(if bool::arbitrary(u)? { 'a' } else { 'é' });
+            }
+            packer.push(Datum::String(&s));
+        }
+    }
+    Ok(())
+}
+
+/// A vector of `'static`-safe scalar datums for use as composite elements.
+fn gen_scalar_vec(u: &mut Unstructured, n: usize) -> arbitrary::Result>> { + let mut out = Vec::with_capacity(n); + for _ in 0..n { + out.push(match u.int_in_range(0u8..=7)? { + 0 => Datum::Null, + 1 => Datum::Int32(i32::arbitrary(u)?), + 2 => Datum::Int64(i64::arbitrary(u)?), + 3 => Datum::UInt8(u8::arbitrary(u)?), + 4 => Datum::from(Numeric::from(i64::arbitrary(u)?)), + 5 => Datum::Interval(Interval::new( + i32::arbitrary(u)?, + i32::arbitrary(u)?, + i64::arbitrary(u)?, + )), + 6 => Datum::Uuid(uuid::Uuid::from_bytes(<[u8; 16]>::arbitrary(u)?)), + _ => Datum::MzTimestamp(Timestamp::from(u64::arbitrary(u)?)), + }); + } + Ok(out) +} + +/// Push a `Range` over `Int32` bounds (possibly empty or with infinite bounds). +fn push_range(packer: &mut RowPacker, u: &mut Unstructured) -> arbitrary::Result<()> { + if bool::arbitrary(u)? { + packer + .push_range(Range { inner: None }) + .expect("empty range is valid"); + return Ok(()); + } + let lo = i32::arbitrary(u)?; + let hi = i32::arbitrary(u)?; + let (lo, hi) = if lo <= hi { (lo, hi) } else { (hi, lo) }; + let lower = RangeBound { + inclusive: bool::arbitrary(u)?, + bound: if bool::arbitrary(u)? { + Some(Datum::Int32(lo)) + } else { + None + }, + }; + let upper = RangeBound { + inclusive: bool::arbitrary(u)?, + bound: if bool::arbitrary(u)? { + Some(Datum::Int32(hi)) + } else { + None + }, + }; + let range = Range { + inner: Some(RangeInner { lower, upper }), + }; + if packer.push_range(range).is_err() { + packer.push(Datum::Int32(lo)); + } + Ok(()) +} + +/// Push one datum: usually a scalar, occasionally a nested composite whose +/// elements are themselves scalars. Always produces a valid `Row`. +fn push_datum(packer: &mut RowPacker, u: &mut Unstructured) -> arbitrary::Result<()> { + match u.int_in_range(0u8..=9)? 
{ + 0..=5 => push_scalar(packer, u), + 6 => { + let n = u.int_in_range(0usize..=4)?; + let datums = gen_scalar_vec(u, n)?; + let dims = [ArrayDimension { + lower_bound: 1, + length: datums.len(), + }]; + packer + .try_push_array(&dims, datums.iter()) + .expect("single-dimension array is always valid"); + Ok(()) + } + 7 => { + let n = u.int_in_range(0usize..=4)?; + let datums = gen_scalar_vec(u, n)?; + packer.push_list(datums.iter()); + Ok(()) + } + 8 => { + let n = u.int_in_range(0usize..=4)?; + let mut keys: Vec = Vec::with_capacity(n); + for _ in 0..n { + let len = u.int_in_range(0usize..=4)?; + let mut s = String::with_capacity(len); + for _ in 0..len { + s.push(if bool::arbitrary(u)? { 'a' } else { 'b' }); + } + keys.push(s); + } + keys.sort(); + keys.dedup(); + let vals = gen_scalar_vec(u, keys.len())?; + packer.push_dict(keys.iter().map(String::as_str).zip(vals.iter().copied())); + Ok(()) + } + _ => push_range(packer, u), + } +} + +/// A small arbitrary `Row` (the untrusted source value), covering the tricky +/// scalar and nested-composite datum space. +fn gen_row(u: &mut Unstructured) -> arbitrary::Result { + let n = u.int_in_range(0usize..=5)?; + let mut row = Row::default(); + let mut packer = row.packer(); + for _ in 0..n { + push_datum(&mut packer, u)?; + } + drop(packer); + Ok(row) +} + +/// An arbitrary `DecodeError` payload for the error-arm variants. +fn gen_decode_error(u: &mut Unstructured) -> arbitrary::Result { + let msg = String::arbitrary(u)?; + let kind = if bool::arbitrary(u)? { + DecodeErrorKind::Text(msg.into_boxed_str()) + } else { + DecodeErrorKind::Bytes(msg.into_boxed_str()) + }; + let raw = Vec::::arbitrary(u)?; + Ok(DecodeError { kind, raw }) +} + +/// An arbitrary `UpsertValue`: usually an `Ok(Row)`, occasionally an +/// `Err(UpsertError)` spanning the whole variant space. +fn gen_value(u: &mut Unstructured) -> arbitrary::Result { + if u.ratio(1u8, 5u8)? { + let err = match u.int_in_range(0u8..=2)? 
{ + 0 => UpsertError::KeyDecode(gen_decode_error(u)?), + 1 => UpsertError::Value(UpsertValueError { + inner: gen_decode_error(u)?, + for_key: gen_row(u)?, + }), + _ => UpsertError::NullKey(UpsertNullKeyError), + }; + Ok(Err(Box::new(err))) + } else { + Ok(Ok(gen_row(u)?)) + } +} + +fn run(u: &mut Unstructured) -> arbitrary::Result<()> { + // The metrics/statistics plumbing is built once and reused across iterations. + static PARTS: OnceLock = OnceLock::new(); + let parts = PARTS.get_or_init(FuzzUpsertParts::new); + let bincode_opts = upsert_bincode_opts(); + + let n_keys = u.int_in_range(1usize..=4)?; + let mut keys: Vec = Vec::with_capacity(n_keys); + let mut expected: HashMap> = HashMap::new(); + let mut chunk: Vec<(UpsertKey, UpsertValue, Diff)> = Vec::new(); + + for k in 0..n_keys { + let key_row = Row::pack_slice(&[Datum::Int32(i32::try_from(k).unwrap())]); + let key = UpsertKey::from_key(Ok(&key_row)); + keys.push(key); + + let n_pairs = u.int_in_range(0usize..=4)?; + for _ in 0..n_pairs { + let prev: UpsertValue = gen_value(u)?; + chunk.push((key, prev.clone(), Diff::ONE)); + chunk.push((key, prev, -Diff::ONE)); + } + if bool::arbitrary(u)? { + let cur = gen_value(u)?; + chunk.push((key, cur.clone(), Diff::ONE)); + expected.insert(key, Some(cur)); + } else { + expected.insert(key, None); + } + } + + // Shuffle the whole multi-key chunk. Consolidation must be order-independent. 
+ for i in (1..chunk.len()).rev() { + let j = u.int_in_range(0..=i)?; + chunk.swap(i, j); + } + + rt().block_on(async { + let mut state = parts.state(); + state + .consolidate_chunk(chunk.into_iter(), true) + .await + .expect("consolidate_chunk should not error on well-formed input"); + + let mut results = vec![UpsertValueAndSize::default(); keys.len()]; + state + .multi_get(keys.iter().copied(), results.iter_mut()) + .await + .expect("multi_get should not error"); + + for (key, result) in keys.iter().zip(results) { + let finalized: Option = match result.value { + None => None, + Some(mut sv) => { + sv.ensure_decoded(bincode_opts, GlobalId::User(0), None); + sv.into_decoded().finalized + } + }; + match (expected.get(key).unwrap(), finalized) { + (Some(cur), Some(got)) => { + assert_eq!(&got, cur, "key consolidated to the wrong value") + } + (Some(_), None) => panic!("consolidation lost a key's value"), + (None, None) => {} + (None, other) => { + panic!("consolidation invented a value for a canceled key: {other:?}") + } + } + } + }); + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let mut u = Unstructured::new(data); + let _ = run(&mut u); +}); diff --git a/src/storage/fuzz/fuzz_targets/upsert_value_roundtrip_v2.rs b/src/storage/fuzz/fuzz_targets/upsert_value_roundtrip_v2.rs new file mode 100644 index 0000000000000..9cccc0f642ead --- /dev/null +++ b/src/storage/fuzz/fuzz_targets/upsert_value_roundtrip_v2.rs @@ -0,0 +1,346 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: the v2 upsert operator's value encoding round-trips. +//! +//! Where the v1 upsert state machine consolidates with the XOR/checksum +//! 
accumulator (see `upsert_consolidate`), the v2 operator +//! (`upsert_continual_feedback_v2`) stores values in a differential +//! `ValRowSpine` as packed `Row` bytes. An [`UpsertValue`] is +//! `Result<Row, Box<UpsertError>>`, so it is folded into a single tagged `Row` +//! by `upsert_value_to_row` (tag `0` + the row's datums for `Ok`, tag `1` + the +//! bincode of the error for `Err`) and recovered from a spine cursor's +//! `DatumSeq` by `datum_seq_to_upsert_value`. A bug in that encode/decode pair +//! silently corrupts the value a v2 upsert source produces, so the round-trip +//! must be the identity for any value (the values come from untrusted source +//! data). +//! +//! We push the encoded `Row` through a real `DatumContainer` (the same byte +//! storage the spine uses) and read it back as a `DatumSeq`, exactly as the +//! operator does, then assert `decode(encode(v)) == v`. +//! +//! Both arms are generated with depth: +//! +//! * the `Ok` arm draws from the *tricky-encoding* datum space (`Numeric`, +//! `Date`, `Timestamp`/`TimestampTz`, `Interval`, `Uuid`, `MzTimestamp`, +//! `Bytes`, long strings, `MzAclItem`, and the nested composites `Array`, +//! `List`, `Map`, and `Range`), so the tag-prefixed re-pack/extend round-trip +//! is exercised across many `Row` tags and over multi-byte length encodings, +//! not just the trivial scalars. +//! * the `Err` arm covers the whole [`UpsertError`] variant space +//! (`KeyDecode`, `Value`, `NullKey`) with non-trivial `DecodeError` payloads +//! and a real decoded key `Row`, so the tag-`1` bincode round-trip is +//! stressed across all error shapes, not just `NullKey`. 
+ +#![no_main] + +use differential_dataflow::trace::implementations::BatchContainer; +use libfuzzer_sys::arbitrary::{self, Arbitrary, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_repr::adt::array::ArrayDimension; +use mz_repr::adt::date::Date; +use mz_repr::adt::interval::Interval; +use mz_repr::adt::mz_acl_item::{AclMode, MzAclItem}; +use mz_repr::adt::numeric::Numeric; +use mz_repr::adt::range::{Range, RangeBound, RangeInner}; +use mz_repr::adt::timestamp::CheckedTimestamp; +use mz_repr::role_id::RoleId; +use mz_repr::{Datum, Row, RowPacker, Timestamp}; +use mz_row_spine::DatumContainer; +use mz_storage::fuzz_exports::{UpsertValue, datum_seq_to_upsert_value, upsert_value_to_row}; +use mz_storage_types::errors::{ + DecodeError, DecodeErrorKind, UpsertError, UpsertNullKeyError, UpsertValueError, +}; +use timely::container::PushInto; + +use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, Utc}; + +/// Push a single arbitrary *scalar* (non-composite) datum, drawn from the +/// tricky-encoding space so we vary the `Row` tag and the bincode length. +fn push_scalar(packer: &mut RowPacker, u: &mut Unstructured) -> arbitrary::Result<()> { + match u.int_in_range(0u8..=15)? { + 0 => packer.push(Datum::Null), + 1 => packer.push(Datum::Int32(i32::arbitrary(u)?)), + 2 => packer.push(Datum::Int64(i64::arbitrary(u)?)), + // `upsert_value_to_row` tags `Ok` values with a leading `UInt8(0)`, so + // include `UInt8` data datums so a value can't be confused with the tag. + 3 => packer.push(Datum::UInt8(u8::arbitrary(u)?)), + 4 => packer.push(if bool::arbitrary(u)? { + Datum::True + } else { + Datum::False + }), + 5 => packer.push(Datum::from(Numeric::from(i64::arbitrary(u)?))), + 6 => { + let days = u.int_in_range(Date::LOW_DAYS..=Date::HIGH_DAYS)?; + packer.push(Datum::Date(Date::from_pg_epoch(days).unwrap())); + } + 7 => { + if let Some(dt) = gen_naive_dt(u)? 
{ + packer.push(Datum::Timestamp(CheckedTimestamp::from_timestamplike(dt).unwrap())); + } else { + packer.push(Datum::Null); + } + } + 8 => { + if let Some(dt) = gen_naive_dt(u)? { + let utc = DateTime::::from_naive_utc_and_offset(dt, Utc); + packer.push(Datum::TimestampTz( + CheckedTimestamp::from_timestamplike(utc).unwrap(), + )); + } else { + packer.push(Datum::Null); + } + } + 9 => packer.push(Datum::Interval(Interval::new( + i32::arbitrary(u)?, + i32::arbitrary(u)?, + i64::arbitrary(u)?, + ))), + 10 => packer.push(Datum::Uuid(uuid::Uuid::from_bytes( + <[u8; 16]>::arbitrary(u)?, + ))), + 11 => packer.push(Datum::MzTimestamp(Timestamp::from(u64::arbitrary(u)?))), + 12 => { + let len = u.int_in_range(0usize..=20)?; + let bytes = u.bytes(len)?; + packer.push(Datum::Bytes(bytes)); + } + 13 => packer.push(Datum::MzAclItem(MzAclItem { + grantee: gen_role_id(u)?, + grantor: gen_role_id(u)?, + acl_mode: AclMode::from_bits_truncate(u64::arbitrary(u)?), + })), + 14 => { + // A string whose length spills past the 1-byte tiny encoding, so the + // multi-byte length path is exercised. Mixes ASCII and multi-byte. + let len = u.int_in_range(0usize..=300)?; + let mut s = String::with_capacity(len); + for _ in 0..len { + s.push(if bool::arbitrary(u)? { 'a' } else { 'é' }); + } + packer.push(Datum::String(&s)); + } + _ => { + let len = u.int_in_range(0usize..=6)?; + let mut s = String::with_capacity(len); + for _ in 0..len { + s.push(if bool::arbitrary(u)? { 'a' } else { 'é' }); + } + packer.push(Datum::String(&s)); + } + } + Ok(()) +} + +/// An arbitrary in-range `NaiveDateTime`, or `None` if out of the supported range. +fn gen_naive_dt(u: &mut Unstructured) -> arbitrary::Result> { + // Stay comfortably inside chrono's representable range. 
+ let year = u.int_in_range(1i32..=9999)?; + let ord = u.int_in_range(1u32..=365)?; + let secs = u.int_in_range(0u32..=86_399)?; + let nanos = u.int_in_range(0u32..=999_999_999)?; + let date = NaiveDate::from_yo_opt(year, ord); + let time = NaiveTime::from_num_seconds_from_midnight_opt(secs, nanos); + Ok(match (date, time) { + (Some(d), Some(t)) => Some(NaiveDateTime::new(d, t)), + _ => None, + }) +} + +fn gen_role_id(u: &mut Unstructured) -> arbitrary::Result { + Ok(match u.int_in_range(0u8..=2)? { + 0 => RoleId::User(u64::arbitrary(u)?), + 1 => RoleId::System(u64::arbitrary(u)?), + _ => RoleId::Public, + }) +} + +/// Push one datum: usually a scalar, occasionally a nested composite +/// (`Array`/`List`/`Map`/`Range`) whose elements are themselves scalars. Falls +/// back to a scalar if a composite can't be constructed for the drawn shape (an +/// invalid range, e.g.), so the caller always ends up with a valid `Row`. +fn push_datum(packer: &mut RowPacker, u: &mut Unstructured) -> arbitrary::Result<()> { + match u.int_in_range(0u8..=9)? { + // Mostly scalars. + 0..=5 => push_scalar(packer, u), + // A 1-D Array of scalars. + 6 => { + let n = u.int_in_range(0usize..=4)?; + let datums = gen_scalar_vec(u, n)?; + let dims = [ArrayDimension { + lower_bound: 1, + length: datums.len(), + }]; + packer + .try_push_array(&dims, datums.iter()) + .expect("single-dimension array is always valid"); + Ok(()) + } + // A List of scalars. + 7 => { + let n = u.int_in_range(0usize..=4)?; + let datums = gen_scalar_vec(u, n)?; + packer.push_list(datums.iter()); + Ok(()) + } + // A Map (dict) with sorted, unique string keys. + 8 => { + let n = u.int_in_range(0usize..=4)?; + let mut keys: Vec = Vec::with_capacity(n); + for _ in 0..n { + let len = u.int_in_range(0usize..=4)?; + let mut s = String::with_capacity(len); + for _ in 0..len { + s.push(if bool::arbitrary(u)? 
{ 'a' } else { 'b' }); + } + keys.push(s); + } + keys.sort(); + keys.dedup(); + let vals = gen_scalar_vec(u, keys.len())?; + packer.push_dict(keys.iter().map(String::as_str).zip(vals.iter().copied())); + Ok(()) + } + // A Range over Int32 bounds. + _ => { + push_range(packer, u)?; + Ok(()) + } + } +} + +/// Generate a vector of `n` owned scalar datums to use as the elements of a +/// composite. (Composite packers need an iterator of `Datum`.) +fn gen_scalar_vec<'a>( + u: &mut Unstructured, + n: usize, +) -> arbitrary::Result<Vec<Datum<'a>>> { + // We only emit `Copy`, `'static`-safe scalar datums here so the returned + // `Datum`s don't borrow from a scratch buffer. That covers nulls, ints, + // numerics, intervals, uuids, and mz-timestamps. + let mut out = Vec::with_capacity(n); + for _ in 0..n { + out.push(match u.int_in_range(0u8..=7)? { + 0 => Datum::Null, + 1 => Datum::Int32(i32::arbitrary(u)?), + 2 => Datum::Int64(i64::arbitrary(u)?), + 3 => Datum::UInt8(u8::arbitrary(u)?), + 4 => Datum::from(Numeric::from(i64::arbitrary(u)?)), + 5 => Datum::Interval(Interval::new( + i32::arbitrary(u)?, + i32::arbitrary(u)?, + i64::arbitrary(u)?, + )), + 6 => Datum::Uuid(uuid::Uuid::from_bytes(<[u8; 16]>::arbitrary(u)?)), + _ => Datum::MzTimestamp(Timestamp::from(u64::arbitrary(u)?)), + }); + } + Ok(out) +} + +/// Push a `Range` over `Int32` bounds (possibly empty or with infinite bounds). +fn push_range(packer: &mut RowPacker, u: &mut Unstructured) -> arbitrary::Result<()> { + if bool::arbitrary(u)? { + // Empty range. + packer + .push_range(Range { inner: None }) + .expect("empty range is valid"); + return Ok(()); + } + let lo = i32::arbitrary(u)?; + let hi = i32::arbitrary(u)?; + let (lo, hi) = if lo <= hi { (lo, hi) } else { (hi, lo) }; + let lower = RangeBound { + inclusive: bool::arbitrary(u)?, + bound: if bool::arbitrary(u)? 
{ + Some(Datum::Int32(lo)) + } else { + None + }, + }; + let upper = RangeBound { + inclusive: bool::arbitrary(u)?, + bound: if bool::arbitrary(u)? { + Some(Datum::Int32(hi)) + } else { + None + }, + }; + let range = Range { + inner: Some(RangeInner { lower, upper }), + }; + // `push_range` canonicalizes and may reject (e.g. a degenerate empty range). + // On rejection just push a plain scalar so the row is still valid. + if packer.push_range(range).is_err() { + packer.push(Datum::Int32(lo)); + } + Ok(()) +} + +/// A small arbitrary `Row` (the untrusted source value), covering the tricky +/// scalar and nested-composite datum space. +fn gen_row(u: &mut Unstructured) -> arbitrary::Result { + let n = u.int_in_range(0usize..=5)?; + let mut row = Row::default(); + let mut packer = row.packer(); + for _ in 0..n { + push_datum(&mut packer, u)?; + } + drop(packer); + Ok(row) +} + +/// An arbitrary `DecodeError` payload for the error-arm variants. +fn gen_decode_error(u: &mut Unstructured) -> arbitrary::Result { + let msg = String::arbitrary(u)?; + let kind = if bool::arbitrary(u)? { + DecodeErrorKind::Text(msg.into_boxed_str()) + } else { + DecodeErrorKind::Bytes(msg.into_boxed_str()) + }; + let raw = Vec::::arbitrary(u)?; + Ok(DecodeError { kind, raw }) +} + +fn gen_value(u: &mut Unstructured) -> arbitrary::Result { + if u.ratio(1u8, 4u8)? { + // Exercise the error arm (tag `1` + bincode of the error) across the + // whole `UpsertError` variant space. + let err = match u.int_in_range(0u8..=2)? { + 0 => UpsertError::KeyDecode(gen_decode_error(u)?), + 1 => UpsertError::Value(UpsertValueError { + inner: gen_decode_error(u)?, + for_key: gen_row(u)?, + }), + _ => UpsertError::NullKey(UpsertNullKeyError), + }; + Ok(Err(Box::new(err))) + } else { + Ok(Ok(gen_row(u)?)) + } +} + +fn run(u: &mut Unstructured) -> arbitrary::Result<()> { + let value: UpsertValue = gen_value(u)?; + + // Encode, store in the spine's byte container, read back, decode. 
+ let row = upsert_value_to_row(&value); + let mut container = DatumContainer::with_capacity(1); + container.push_into(row); + let decoded = datum_seq_to_upsert_value(container.index(0)); + + assert_eq!(value, decoded, "v2 upsert value encoding did not round-trip"); + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let mut u = Unstructured::new(data); + let _ = run(&mut u); +}); diff --git a/src/storage/src/lib.rs b/src/storage/src/lib.rs index 3aa88c8fc241d..cfa02903e5845 100644 --- a/src/storage/src/lib.rs +++ b/src/storage/src/lib.rs @@ -9,7 +9,10 @@ //! Materialize's storage layer. -#![warn(missing_docs)] +// The `fuzzing` feature re-exports internal upsert types (see `fuzz_exports`) +// that are intentionally undocumented. Don't require docs for them in that +// build. The normal public API is still linted. +#![cfg_attr(not(feature = "fuzzing"), warn(missing_docs))] pub mod decode; pub mod internal_control; @@ -24,6 +27,20 @@ pub(crate) mod upsert; mod upsert_continual_feedback; mod upsert_continual_feedback_v2; +/// Internal upsert types re-exported under `cfg(feature = "fuzzing")` so the +/// storage fuzz crate can drive the upsert state machine and value encodings +/// directly. The modules themselves stay crate-private; this facade exposes +/// only the items the fuzz targets need (mirroring `mz-persist-client` and +/// `mz-pgwire`). Not part of the public API. 
+#[cfg(feature = "fuzzing")] +pub mod fuzz_exports { + pub use crate::upsert::types::{ + FuzzUpsertParts, StateValue, UpsertValueAndSize, upsert_bincode_opts, + }; + pub use crate::upsert::{UpsertKey, UpsertValue, fuzz_drain_staged_input}; + pub use crate::upsert_continual_feedback_v2::{datum_seq_to_upsert_value, upsert_value_to_row}; +} + pub(crate) mod healthcheck; pub use server::serve; diff --git a/src/storage/src/render/persist_sink.rs b/src/storage/src/render/persist_sink.rs index 7a7e4fa6b7127..b915a9fa74b8e 100644 --- a/src/storage/src/render/persist_sink.rs +++ b/src/storage/src/render/persist_sink.rs @@ -121,7 +121,7 @@ use timely::PartialOrder; use timely::container::CapacityContainerBuilder; use timely::dataflow::channels::pact::{Exchange, Pipeline}; use timely::dataflow::operators::vec::Broadcast; -use timely::dataflow::operators::{Capability, CapabilitySet, Inspect}; +use timely::dataflow::operators::{Capability, CapabilitySet, InspectCore}; use timely::dataflow::{Scope, Stream, StreamVec}; use timely::progress::{Antichain, Timestamp}; use tokio::sync::Semaphore; @@ -847,8 +847,21 @@ fn write_batches<'scope>( } }); + // Use `InspectCore::inspect_container` instead of `Inspect::inspect`. + // `Inspect` carries a `where for<'a> &'a C: IntoIterator` bound, and on + // macOS the solver can satisfy that bound by chasing objc2's + // `&Retained: IntoIterator` blanket impl into an endless + // `Retained>` chain, overflowing the recursion limit. + // `InspectCore` has no such bound, so the cascade never starts. We + // iterate the container by hand to recover the per-item callback. 
let output_stream = if collection_id.is_user() { - output_stream.inspect(|d| trace!("batch: {:?}", d)) + InspectCore::inspect_container(output_stream, |event| { + if let Ok((_, data)) = event { + for d in data { + trace!("batch: {:?}", d); + } + } + }) } else { output_stream }; diff --git a/src/storage/src/source/source_reader_pipeline.rs b/src/storage/src/source/source_reader_pipeline.rs index 5a82b6b596dd2..76fafe5670b3b 100644 --- a/src/storage/src/source/source_reader_pipeline.rs +++ b/src/storage/src/source/source_reader_pipeline.rs @@ -55,7 +55,7 @@ use timely::dataflow::operators::core::Map as _; use timely::dataflow::operators::generic::OutputBuilder; use timely::dataflow::operators::generic::builder_rc::OperatorBuilder as OperatorBuilderRc; use timely::dataflow::operators::vec::Broadcast; -use timely::dataflow::operators::{CapabilitySet, Inspect, Leave}; +use timely::dataflow::operators::{CapabilitySet, InspectCore, Leave}; use timely::dataflow::{Scope, StreamVec}; use timely::order::TotalOrder; use timely::progress::frontier::MutableAntichain; @@ -381,9 +381,20 @@ where // Broadcasting does more work than necessary, which would be to exchange the probes to the // worker that will be the one minting the bindings but we'd have to thread this information // through and couple the two functions enough that it's not worth the optimization (I think). - probe_stream.broadcast().inspect(move |probe| { - // We don't care if the receiver is gone - let _ = probed_upper_tx.send(Some(probe.clone())); + // Use `InspectCore::inspect_container` instead of `Inspect::inspect`. + // `Inspect` carries a `where for<'a> &'a C: IntoIterator` bound, and on + // macOS the solver can satisfy that bound by chasing objc2's + // `&Retained: IntoIterator` blanket impl into an endless + // `Retained>` chain, overflowing the recursion limit. + // `InspectCore` has no such bound, so the cascade never starts. We + // iterate the container by hand to recover the per-item callback. 
+ probe_stream.broadcast().inspect_container(move |event| { + if let Ok((_, data)) = event { + for probe in data { + // We don't care if the receiver is gone + let _ = probed_upper_tx.send(Some(probe.clone())); + } + } }); ( diff --git a/src/storage/src/upsert.rs b/src/storage/src/upsert.rs index 16269fd30fb82..7d74a5d71ad83 100644 --- a/src/storage/src/upsert.rs +++ b/src/storage/src/upsert.rs @@ -55,7 +55,7 @@ use types::{ upsert_bincode_opts, }; -#[cfg(test)] +#[cfg(any(test, feature = "fuzzing"))] pub mod memory; pub(crate) mod rocksdb; // TODO(aljoscha): Move next to upsert module, rename to upsert_types. @@ -969,6 +969,76 @@ async fn drain_staged_input( } } +/// A no-op-ish error emitter for the fuzzing hook. With the in-memory backend +/// and the well-formed inputs the fuzzer builds, `multi_get`/`multi_put` never +/// error, so reaching this is itself a finding. +#[cfg(feature = "fuzzing")] +struct PanicErrorEmitter; + +#[cfg(feature = "fuzzing")] +#[async_trait::async_trait(?Send)] +impl UpsertErrorEmitter for PanicErrorEmitter { + async fn emit(&mut self, context: String, e: anyhow::Error) { + panic!("unexpected upsert state error during fuzzing: {context}: {e}"); + } +} + +/// Fuzzing hook: run a single `drain_staged_input` over `commands` (each a +/// `(timestamp, key, order, value)`, where `value == None` is a delete) against +/// a fresh empty in-memory state, draining everything strictly below +/// `drain_to`. Returns the emitted output updates and the final finalized value +/// of each key in `all_keys`. Exposed only for fuzzing. Not a stable public +/// API. 
+#[cfg(feature = "fuzzing")] +pub async fn fuzz_drain_staged_input( + parts: &types::FuzzUpsertParts, + source_config: &crate::source::SourceExportCreationConfig, + commands: Vec<(u64, UpsertKey, u64, Option)>, + drain_to: u64, + all_keys: &[UpsertKey], +) -> (Vec<(UpsertValue, u64, Diff)>, Vec>) { + let mut state = parts.state(); + let mut stash: Vec<(u64, UpsertKey, Reverse, Option)> = commands + .into_iter() + .map(|(ts, key, order, value)| (ts, key, Reverse(order), value)) + .collect(); + let mut commands_state = indexmap::IndexMap::new(); + let mut output = Vec::new(); + let mut multi_get_scratch = Vec::new(); + let mut emitter = PanicErrorEmitter; + + drain_staged_input( + &mut stash, + &mut commands_state, + &mut output, + &mut multi_get_scratch, + DrainStyle::ToUpper(&Antichain::from_elem(drain_to)), + &mut emitter, + &mut state, + source_config, + ) + .await; + + let bincode_opts = types::upsert_bincode_opts(); + let mut results = vec![types::UpsertValueAndSize::default(); all_keys.len()]; + state + .multi_get(all_keys.iter().copied(), results.iter_mut()) + .await + .expect("multi_get in fuzz hook should not error"); + let final_state = results + .into_iter() + .map(|r| match r.value { + None => None, + Some(mut sv) => { + sv.ensure_decoded(bincode_opts, GlobalId::User(0), None); + sv.into_decoded().finalized + } + }) + .collect(); + + (output, final_state) +} + // Created a struct to hold the configs for upserts. // So that new configs don't require a new method parameter. pub(crate) struct UpsertConfig { diff --git a/src/storage/src/upsert/types.rs b/src/storage/src/upsert/types.rs index 2bf8270aa2c95..bc1aa0d0427f4 100644 --- a/src/storage/src/upsert/types.rs +++ b/src/storage/src/upsert/types.rs @@ -304,7 +304,7 @@ impl StateValue { /// 2. An estimate (it only looks at value sizes, and not errors) /// /// Other implementations may use more accurate accounting. 
- #[cfg(test)] + #[cfg(any(test, feature = "fuzzing"))] pub fn memory_size(&self) -> usize { use mz_repr::Row; use std::mem::size_of; @@ -1021,6 +1021,90 @@ impl<'metrics, S, T, O> UpsertState<'metrics, S, T, O> { } } +/// Owns the metrics/statistics plumbing an [`UpsertState`] requires, so a fuzz +/// target can build fresh in-memory `UpsertState`s without reconstructing it +/// each iteration (or leaking it). Construct once. Call [`Self::state`] per +/// iteration. Exposed only for fuzzing. Not a stable public API. +#[cfg(feature = "fuzzing")] +#[doc(hidden)] +pub struct FuzzUpsertParts { + shared: std::sync::Arc, + worker: UpsertMetrics, + stats_defs: crate::statistics::SourceStatisticsMetricDefs, +} + +#[cfg(feature = "fuzzing")] +#[doc(hidden)] +impl FuzzUpsertParts { + pub fn new() -> Self { + let registry = mz_ore::metrics::MetricsRegistry::new(); + let upsert_defs = crate::metrics::upsert::UpsertMetricDefs::register_with(®istry); + let id = GlobalId::User(0); + let shared = upsert_defs.shared(&id); + let worker = UpsertMetrics::new(&upsert_defs, id, 0, None); + let stats_defs = crate::statistics::SourceStatisticsMetricDefs::register_with(®istry); + FuzzUpsertParts { + shared, + worker, + stats_defs, + } + } + + /// A fresh, empty in-memory `UpsertState` borrowing the shared metrics. + pub fn state( + &self, + ) -> UpsertState<'_, crate::upsert::memory::InMemoryHashMap, u64, u64> { + let id = GlobalId::User(0); + let stats = crate::statistics::SourceStatistics::new( + id, + 0, + &self.stats_defs, + id, + &mz_persist_client::ShardId::new(), + mz_storage_types::sources::SourceEnvelope::CdcV2, + timely::progress::Antichain::from_elem(mz_repr::Timestamp::MIN), + ); + UpsertState::new( + crate::upsert::memory::InMemoryHashMap::default(), + std::sync::Arc::clone(&self.shared), + &self.worker, + stats, + 0, + ) + } + + /// A `SourceExportCreationConfig` for driving `drain_staged_input` directly. 
+ /// `drain_staged_input` only reads `.id` from it, but the struct requires the + /// full metrics/statistics plumbing, which this builds. + pub fn source_config(&self) -> crate::source::SourceExportCreationConfig { + let id = GlobalId::User(0); + let registry = mz_ore::metrics::MetricsRegistry::new(); + let metrics = crate::metrics::StorageMetrics::register_with(®istry); + let source_statistics = crate::statistics::SourceStatistics::new( + id, + 0, + &self.stats_defs, + id, + &mz_persist_client::ShardId::new(), + mz_storage_types::sources::SourceEnvelope::CdcV2, + timely::progress::Antichain::from_elem(mz_repr::Timestamp::MIN), + ); + crate::source::SourceExportCreationConfig { + id, + worker_id: 0, + metrics, + source_statistics, + } + } +} + +#[cfg(feature = "fuzzing")] +impl Default for FuzzUpsertParts { + fn default() -> Self { + Self::new() + } +} + impl UpsertState<'_, S, T, O> where S: UpsertStateBackend, diff --git a/src/storage/src/upsert_continual_feedback_v2.rs b/src/storage/src/upsert_continual_feedback_v2.rs index c3782be0b42f4..1885e17f80cf5 100644 --- a/src/storage/src/upsert_continual_feedback_v2.rs +++ b/src/storage/src/upsert_continual_feedback_v2.rs @@ -29,7 +29,7 @@ //! ## Operator loop (each iteration) //! //! 1. **Ingest source data.** Read upsert commands from the source input, -//! wrap each in an [`UpsertDiff`] (carrying a columnar order key projected +//! wrap each in an `UpsertDiff` (carrying a columnar order key projected //! from `FromTime` via [`UpsertSourceTime`] for dedup), and push into the //! source-stash batcher. The batcher is a paged columnar merge batcher: it //! consolidates entries for the same `(key, time)` via the `UpsertDiff` @@ -52,7 +52,7 @@ //! caught up yet. Push back into the batcher for the next iteration. //! - **Already persisted** (below the persist frontier): some writer has //! already advanced the shard past this time, so it is dropped. See -//! 
[`drain_sealed_input`] for why re-stashing it would strand the data +//! `drain_sealed_input` for why re-stashing it would strand the data //! and pin the output frontier below the shard upper. //! //! 4. **Capability management.** Downgrade the output capability to the @@ -82,6 +82,9 @@ use differential_dataflow::trace::{Batcher, Cursor, Description, TraceReader}; use differential_dataflow::{AsCollection, VecCollection}; use mz_repr::{Datum, Diff, GlobalId, Row}; use mz_row_spine::{ValRowColPagedBuilder, ValRowSpine}; +// Only the fuzzing-gated `datum_seq_to_upsert_value` takes a `DatumSeq`. +#[cfg(feature = "fuzzing")] +use mz_row_spine::DatumSeq; use mz_storage_types::errors::{DataflowError, EnvelopeError, UpsertError}; use mz_timely_util::builder_async::{ AsyncOutputHandle, Event as AsyncEvent, OperatorBuilder as AsyncOperatorBuilder, @@ -273,7 +276,13 @@ type UpsertOutputHandle = /// Encode an [`UpsertValue`] as a `Row` with a leading tag column so both `Ok` /// and `Err` payloads round-trip through `Row` byte storage. -fn upsert_value_to_row(value: &UpsertValue) -> Row { +/// +/// Used on the render path. `pub` only so [`crate::fuzz_exports`] can re-export +/// it under the `fuzzing` feature for the storage fuzz crate. The enclosing +/// module is crate-private, so it is not otherwise reachable. Not a stable +/// public API. +#[doc(hidden)] +pub fn upsert_value_to_row(value: &UpsertValue) -> Row { let mut row = Row::default(); let mut packer = row.packer(); match value { @@ -300,6 +309,17 @@ fn upsert_value_byte_len(value: &UpsertValue) -> usize { } } +/// Decode an [`UpsertValue`] produced by [`upsert_value_to_row`] back from the +/// `DatumSeq` view returned by a `ValRowSpine` cursor. +/// +/// Exists only for the storage fuzz crate, so it is gated behind the `fuzzing` +/// feature. Not a stable public API. 
+#[cfg(feature = "fuzzing")] +#[doc(hidden)] +pub fn datum_seq_to_upsert_value(seq: DatumSeq<'_>) -> UpsertValue { + decode_upsert_value(seq) +} + /// Decode an [`UpsertValue`] produced by [`upsert_value_to_row`] from any datum /// iterator — a `ValRowSpine` cursor's `DatumSeq` or a stashed `Row`'s `iter`. fn decode_upsert_value<'a>(mut iter: impl Iterator>) -> UpsertValue { diff --git a/src/transform/fuzz/.gitignore b/src/transform/fuzz/.gitignore new file mode 100644 index 0000000000000..ab0eaa1a49031 --- /dev/null +++ b/src/transform/fuzz/.gitignore @@ -0,0 +1,5 @@ +target/ +corpus/ +artifacts/ +coverage/ +Cargo.lock diff --git a/src/transform/fuzz/Cargo.toml b/src/transform/fuzz/Cargo.toml new file mode 100644 index 0000000000000..1e89eb8e5fd81 --- /dev/null +++ b/src/transform/fuzz/Cargo.toml @@ -0,0 +1,43 @@ +# Fuzz crate for mz-transform optimizer transforms (shape/`typ` preservation). +# +# Excluded from the main workspace because libFuzzer requires nightly Rust. +# Run via the repo-wide runner, or locally: +# cd src/transform/fuzz +# cargo +nightly fuzz run mir_relation_transforms -- -max_total_time=60 + +[package] +workspace = "../../../test/cargo-fuzz" +name = "mz-transform-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +mz-expr = { path = "../../expr" } +mz-repr = { path = "../../repr" } +mz-transform = { path = ".." 
} + +[[bin]] +name = "mir_relation_transforms" +path = "fuzz_targets/mir_relation_transforms.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "full_optimizer_equiv" +path = "fuzz_targets/full_optimizer_equiv.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "optimizer_symbolic_equiv" +path = "fuzz_targets/optimizer_symbolic_equiv.rs" +test = false +doc = false +bench = false diff --git a/src/transform/fuzz/fuzz_targets/full_optimizer_equiv.rs b/src/transform/fuzz/fuzz_targets/full_optimizer_equiv.rs new file mode 100644 index 0000000000000..de1951ecf92b2 --- /dev/null +++ b/src/transform/fuzz/fuzz_targets/full_optimizer_equiv.rs @@ -0,0 +1,432 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: the *entire* logical optimizer must preserve a relation's +//! result. Where `mir_relation_transforms` checks individual transforms in +//! isolation, this target runs the full `Optimizer::logical_optimizer` pipeline, +//! every transform, in the real order, with all their interactions, and +//! checks that the optimized plan produces the same rows as the input. +//! +//! We build a random, well-typed plan rooted at `Constant` collections over an +//! `int4`/`int8`/`bool` schema, using the bug-rich relational operators the +//! per-transform target omits: `Join` (over 2-4 inputs, with multiple equi-join +//! equivalences chaining several inputs together), `Reduce` (group keys + +//! min/max/sum/any/all/count aggregates over *computed* inputs, not just bare +//! column refs), `TopK`, `Threshold`, and +//! `Union`/`Negate`/`Distinct`/`Map`/`Filter`/`Project`. Because every leaf is +//! 
constant and `FoldConstants` evaluates all of these operators, both the input +//! and the optimized output fold to actual result rows. +//! +//! The multi-input joins with several equivalence classes (e.g. `a.x = b.x` and +//! `b.y = c.y`) are what drive the join-ordering/implementation planner, equality +//! propagation, and predicate pushdown through `Get`s, the parts of the optimizer +//! a 2-way, at-most-one-equivalence join barely touches. Computed aggregate inputs +//! likewise exercise aggregate-expression simplification and the reduction MFP. +//! +//! Oracle: fold the input to its `(row, diff)` multiset, run the optimizer, fold +//! the result. When both fold to a constant, the multisets must be equal. A +//! divergence is a miscompile. The comparison is conservative (we only assert +//! when both sides fold, and skip when the optimizer returns an error, e.g. the +//! `Typecheck` pass rejecting a plan shape), so a surviving assertion failure or +//! a panic inside the optimizer is a genuine finding. + +#![no_main] + +use std::collections::BTreeMap; + +use libfuzzer_sys::arbitrary::{self, Arbitrary, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_expr::{ + AggregateExpr, AggregateFunc, ColumnOrder, EvalError, MirRelationExpr, MirScalarExpr, func, +}; +use mz_repr::optimize::OptimizerFeatures; +use mz_repr::{Datum, Diff, GlobalId, ReprColumnType, ReprRelationType, ReprScalarType, Row}; +use mz_transform::dataflow::DataflowMetainfo; +use mz_transform::fold_constants::FoldConstants; +use mz_transform::{Optimizer, TransformCtx, typecheck}; + +#[derive(Clone, Copy, PartialEq)] +enum Ty { + Int32, + Int64, + Bool, +} + +fn scalar_ty(ty: Ty) -> ReprScalarType { + match ty { + Ty::Int32 => ReprScalarType::Int32, + Ty::Int64 => ReprScalarType::Int64, + Ty::Bool => ReprScalarType::Bool, + } +} + +fn rand_ty(u: &mut Unstructured) -> arbitrary::Result { + Ok(match u.int_in_range(0u8..=2)? 
{
+        0 => Ty::Int32,
+        1 => Ty::Int64,
+        _ => Ty::Bool,
+    })
+}
+
+/// Draw one datum of type `ty`: `Datum::Null` with ratio 1 in 5, otherwise an
+/// arbitrary value of that type.
+fn gen_datum(u: &mut Unstructured, ty: Ty) -> arbitrary::Result<Datum<'static>> {
+    if u.ratio(1u8, 5u8)? {
+        return Ok(Datum::Null);
+    }
+    Ok(match ty {
+        Ty::Int32 => Datum::Int32(i32::arbitrary(u)?),
+        Ty::Int64 => Datum::Int64(i64::arbitrary(u)?),
+        Ty::Bool => {
+            if bool::arbitrary(u)? {
+                Datum::True
+            } else {
+                Datum::False
+            }
+        }
+    })
+}
+
+/// Indices of the columns of `schema` whose type is exactly `ty`.
+fn cols_of(schema: &[Ty], ty: Ty) -> Vec<usize> {
+    schema
+        .iter()
+        .enumerate()
+        .filter(|(_, t)| **t == ty)
+        .map(|(i, _)| i)
+        .collect()
+}
+
+/// A shallow, well-typed scalar expression of type `ty` over `schema`.
+///
+/// At `depth == 0` (or on a coin flip) this yields a leaf: a reference to a
+/// column of type `ty` when one exists, else a literal value, a literal null,
+/// or a literal `DivisionByZero` error. Otherwise it recurses with `depth - 1`.
+fn gen_scalar(
+    u: &mut Unstructured,
+    ty: Ty,
+    schema: &[Ty],
+    depth: u32,
+) -> arbitrary::Result<MirScalarExpr> {
+    let st = scalar_ty(ty);
+    if depth == 0 || u.ratio(1u8, 2u8)? {
+        let cols = cols_of(schema, ty);
+        if !cols.is_empty() && bool::arbitrary(u)? {
+            let idx = u.int_in_range(0..=cols.len() - 1)?;
+            return Ok(MirScalarExpr::column(cols[idx]));
+        }
+        return Ok(match u.int_in_range(0u8..=2)? {
+            0 => MirScalarExpr::literal_ok(gen_datum(u, ty)?, st),
+            1 => MirScalarExpr::literal_null(st),
+            _ => MirScalarExpr::literal(Err(EvalError::DivisionByZero), st),
+        });
+    }
+    let d = depth - 1;
+    Ok(match ty {
+        Ty::Int32 => {
+            let a = gen_scalar(u, Ty::Int32, schema, d)?;
+            let b = gen_scalar(u, Ty::Int32, schema, d)?;
+            match u.int_in_range(0u8..=2)? {
+                0 => a.call_binary(b, func::AddInt32),
+                1 => a.call_binary(b, func::SubInt32),
+                _ => a.call_binary(b, func::MulInt32),
+            }
+        }
+        Ty::Int64 => {
+            let a = gen_scalar(u, Ty::Int64, schema, d)?;
+            let b = gen_scalar(u, Ty::Int64, schema, d)?;
+            match u.int_in_range(0u8..=2)? {
+                0 => a.call_binary(b, func::AddInt64),
+                1 => a.call_binary(b, func::SubInt64),
+                _ => a.call_binary(b, func::MulInt64),
+            }
+        }
+        Ty::Bool => match u.int_in_range(0u8..=3)?
{ + 0 => gen_scalar(u, Ty::Bool, schema, d)?.and(gen_scalar(u, Ty::Bool, schema, d)?), + 1 => gen_scalar(u, Ty::Bool, schema, d)?.or(gen_scalar(u, Ty::Bool, schema, d)?), + 2 => gen_scalar(u, Ty::Bool, schema, d)?.not(), + _ => { + let t = rand_ty(u)?; + let a = gen_scalar(u, t, schema, d)?; + let b = gen_scalar(u, t, schema, d)?; + a.call_binary(b, func::Eq) + } + }, + }) +} + +fn gen_constant(u: &mut Unstructured) -> arbitrary::Result<(MirRelationExpr, Vec)> { + let ncols = u.int_in_range(1usize..=3)?; + let schema: Vec = (0..ncols) + .map(|_| rand_ty(u)) + .collect::>()?; + let col_types: Vec = schema + .iter() + .map(|t| scalar_ty(*t).nullable(true)) + .collect(); + let nrows = u.int_in_range(0usize..=4)?; + let mut rows = Vec::with_capacity(nrows); + for _ in 0..nrows { + let mut row = Vec::with_capacity(ncols); + for t in &schema { + row.push(gen_datum(u, *t)?); + } + rows.push(row); + } + Ok(( + MirRelationExpr::constant(rows, ReprRelationType::new(col_types)), + schema, + )) +} + +/// One aggregate over `schema`, plus the scalar type of its output column. +/// +/// The aggregated input is a freshly generated scalar expression of the +/// function's required input type (not just a bare column reference), so the +/// reduction sees `max(a + b)`, `sum(if p then x else y)`, etc., exercising +/// aggregate-input simplification and the reduce MFP. +fn gen_aggregate(u: &mut Unstructured, schema: &[Ty]) -> arbitrary::Result<(AggregateExpr, Ty)> { + // (func, required input type, output type). 
+ let opts: &[(AggregateFunc, Ty, Ty)] = &[ + (AggregateFunc::MaxInt32, Ty::Int32, Ty::Int32), + (AggregateFunc::MinInt32, Ty::Int32, Ty::Int32), + (AggregateFunc::SumInt32, Ty::Int32, Ty::Int64), + (AggregateFunc::MaxInt64, Ty::Int64, Ty::Int64), + (AggregateFunc::MinInt64, Ty::Int64, Ty::Int64), + (AggregateFunc::Any, Ty::Bool, Ty::Bool), + (AggregateFunc::All, Ty::Bool, Ty::Bool), + (AggregateFunc::Count, Ty::Int32, Ty::Int64), + ]; + let idx = u.int_in_range(0..=opts.len() - 1)?; + let (func, in_ty, out) = opts[idx].clone(); + // A computed input of the required type. The aggregate `expr` can be any + // well-typed scalar, not just a column. Depth keeps it bounded. + let expr = gen_scalar(u, in_ty, schema, 2)?; + Ok(( + AggregateExpr { + func, + expr, + distinct: bool::arbitrary(u)?, + }, + out, + )) +} + +fn gen_rel(u: &mut Unstructured, depth: u32) -> arbitrary::Result<(MirRelationExpr, Vec)> { + if depth == 0 || u.ratio(2u8, 5u8)? { + return gen_constant(u); + } + let (inner, schema) = gen_rel(u, depth - 1)?; + let arity = schema.len(); + Ok(match u.int_in_range(0u8..=9)? { + // Filter + 0 => { + let n = u.int_in_range(1usize..=2)?; + let preds = (0..n) + .map(|_| gen_scalar(u, Ty::Bool, &schema, 2)) + .collect::>>()?; + (inner.filter(preds), schema) + } + // Map one column + 1 => { + let ty = rand_ty(u)?; + let e = gen_scalar(u, ty, &schema, 2)?; + let mut s = schema.clone(); + s.push(ty); + (inner.map(vec![e]), s) + } + // Project a (reordered/duplicated) subset + 2 => { + let k = u.int_in_range(1usize..=arity)?; + let mut outputs = Vec::with_capacity(k); + for _ in 0..k { + outputs.push(u.int_in_range(0..=arity - 1)?); + } + let s = outputs.iter().map(|&i| schema[i]).collect(); + (inner.project(outputs), s) + } + 3 => (inner.negate(), schema), + 4 => (inner.distinct(), schema), + 5 => (inner.threshold(), schema), + // Union with a same-schema relation (self, or self negated). + 6 => { + let other = if bool::arbitrary(u)? 
{ + inner.clone() + } else { + inner.clone().negate() + }; + (inner.union(other), schema) + } + // Join 2-4 relations with multiple equi-join equivalence classes that + // chain inputs together (e.g. `in0.x = in1.x` and `in1.y = in2.y`). This + // is what makes join ordering/implementation planning and equality + // propagation actually run, unlike a 2-way single-equivalence join. + 7 => { + let n_extra = u.int_in_range(1usize..=3)?; + let mut inputs = vec![inner]; + // Per-input absolute schema, used only to find type-matching join cols. + let mut input_schemas = vec![schema.clone()]; + for _ in 0..n_extra { + let (other, oschema) = gen_rel(u, depth - 1)?; + input_schemas.push(oschema); + inputs.push(other); + } + // For each newly added input `r`, try to add one equivalence per type + // linking it to some earlier input `l < r` with a column of that type. + let mut variables: Vec> = Vec::new(); + for r in 1..inputs.len() { + for ty in [Ty::Int32, Ty::Int64, Ty::Bool] { + let rc = cols_of(&input_schemas[r], ty); + if rc.is_empty() || !bool::arbitrary(u)? { + continue; + } + // Pick an earlier input that also has a column of this type. + let candidates: Vec = (0..r) + .filter(|&l| !cols_of(&input_schemas[l], ty).is_empty()) + .collect(); + if candidates.is_empty() { + continue; + } + let l = candidates[u.int_in_range(0..=candidates.len() - 1)?]; + let lc = cols_of(&input_schemas[l], ty); + let li = lc[u.int_in_range(0..=lc.len() - 1)?]; + let rj = rc[u.int_in_range(0..=rc.len() - 1)?]; + variables.push(vec![(l, li), (r, rj)]); + } + } + let mut s = schema.clone(); + for os in &input_schemas[1..] { + s.extend(os.iter().copied()); + } + (MirRelationExpr::join(inputs, variables), s) + } + // Reduce: a distinct subset group key plus 0..=2 aggregates. + 8 => { + let mut group_key = Vec::new(); + for c in 0..arity { + if bool::arbitrary(u)? 
{
+                    group_key.push(c);
+                }
+            }
+            let n_agg = u.int_in_range(0usize..=2)?;
+            let mut aggregates = Vec::with_capacity(n_agg);
+            let mut out: Vec<Ty> = group_key.iter().map(|&k| schema[k]).collect();
+            for _ in 0..n_agg {
+                let (a, t) = gen_aggregate(u, &schema)?;
+                aggregates.push(a);
+                out.push(t);
+            }
+            if group_key.is_empty() && aggregates.is_empty() {
+                aggregates.push(AggregateExpr {
+                    func: AggregateFunc::Count,
+                    expr: MirScalarExpr::column(0),
+                    distinct: false,
+                });
+                out.push(Ty::Int64);
+            }
+            (inner.reduce(group_key, aggregates, None), out)
+        }
+        // TopK over the input.
+        _ => {
+            let mut group_key = Vec::new();
+            for c in 0..arity {
+                if u.ratio(1u8, 3u8)? {
+                    group_key.push(c);
+                }
+            }
+            // Order by *every* column (in a random direction each) so the order
+            // is total: distinct rows never tie, hence which rows a LIMIT/OFFSET
+            // keeps is unambiguous and the result multiset is deterministic. (A
+            // partial order would let the optimizer legitimately keep different
+            // tied rows, a spurious divergence rather than a bug.)
+            let mut order_key = Vec::with_capacity(arity);
+            for column in 0..arity {
+                order_key.push(ColumnOrder {
+                    column,
+                    desc: bool::arbitrary(u)?,
+                    nulls_last: bool::arbitrary(u)?,
+                });
+            }
+            let limit = if bool::arbitrary(u)? {
+                Some(MirScalarExpr::literal_ok(
+                    Datum::Int64(u.int_in_range(0i64..=3)?),
+                    ReprScalarType::Int64,
+                ))
+            } else {
+                None
+            };
+            let offset = u.int_in_range(0usize..=2)?;
+            (
+                inner.top_k(group_key, order_key, limit, offset, None),
+                schema,
+            )
+        }
+    })
+}
+
+/// Fold `rel`. If it reduces to a `Constant` of `Ok` rows, return the
+/// consolidated `(row, diff)` multiset, otherwise `None`.
+///
+/// Diffs of equal rows are summed and rows whose net diff is zero are dropped,
+/// so two plans compare equal exactly when they denote the same multiset.
+fn fold_to_multiset(mut rel: MirRelationExpr) -> Option<BTreeMap<Row, Diff>> {
+    let mut typ = rel.typ();
+    (FoldConstants { limit: None })
+        .action(&mut rel, &mut typ)
+        .ok()?;
+    let (Ok(rows), _) = rel.as_const()? else {
+        return None;
+    };
+    let mut multiset: BTreeMap<Row, Diff> = BTreeMap::new();
+    for (row, diff) in rows {
+        *multiset.entry(row.clone()).or_insert(Diff::ZERO) += *diff;
+    }
+    multiset.retain(|_, d| *d != Diff::ZERO);
+    Some(multiset)
+}
+
+/// Run the full logical optimizer. Returns `None` if it errors (e.g. the
+/// `Typecheck` pass rejects the plan). Only a panic is a finding here.
+#[allow(deprecated)]
+fn optimize(rel: MirRelationExpr) -> Option<MirRelationExpr> {
+    let features = OptimizerFeatures::default();
+    let typecheck_ctx = typecheck::empty_typechecking_context();
+    let mut df_meta = DataflowMetainfo::default();
+    let mut ctx = TransformCtx::local(
+        &features,
+        &typecheck_ctx,
+        &mut df_meta,
+        None,
+        Some(GlobalId::Transient(1)),
+    );
+    let optimizer = Optimizer::logical_optimizer(&mut ctx);
+    optimizer
+        .optimize(rel, &mut ctx)
+        .ok()
+        .map(|o| o.into_inner())
+}
+
+/// Generate one plan from `u` and compare its folded result before and after
+/// optimization. Inputs that do not fold, or that the optimizer rejects, are
+/// skipped.
+fn run(u: &mut Unstructured) -> arbitrary::Result<()> {
+    let (rel, _schema) = gen_rel(u, 4)?;
+
+    // The input must fold to actual rows for there to be anything to compare.
+    let Some(baseline) = fold_to_multiset(rel.clone()) else {
+        return Ok(());
+    };
+
+    let Some(optimized) = optimize(rel.clone()) else {
+        return Ok(());
+    };
+
+    // The optimizer is semantics-preserving: the optimized plan must fold to the
+    // same multiset. We only assert when the optimized plan also folds (it should,
+    // since all leaves are constant), staying conservative about fold limitations.
+ if let Some(after) = fold_to_multiset(optimized) { + assert_eq!( + baseline, after, + "the optimizer changed the result multiset\n{rel:?}" + ); + } + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let mut u = Unstructured::new(data); + let _ = run(&mut u); +}); diff --git a/src/transform/fuzz/fuzz_targets/mir_relation_transforms.rs b/src/transform/fuzz/fuzz_targets/mir_relation_transforms.rs new file mode 100644 index 0000000000000..edba433f7ff92 --- /dev/null +++ b/src/transform/fuzz/fuzz_targets/mir_relation_transforms.rs @@ -0,0 +1,445 @@ +// Copyright Materialize, Inc. and contributors. All rights reserved. +// +// Use of this software is governed by the Business Source License +// included in the LICENSE file. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: optimizer transforms on `MirRelationExpr` must preserve a +//! relation's shape and results. We build a random, well-typed plan rooted at +//! `Constant` collections (so it is fully constant-foldable and every scalar +//! subexpression is well-typed by construction), then for each transform check: +//! +//! 1. Shape preservation: arity and per-column scalar types are unchanged +//! (nullability and keys may be refined). +//! 2. Result equivalence: because the plan is fully constant, `FoldConstants` +//! evaluates it to a `Constant`, giving the actual result rows. Folding the +//! transformed plan must yield the same consolidated `(row, diff)` multiset +//! as folding the original plan, a genuine correctness check. +//! +//! Transforms exercised: `FoldConstants` itself, `CanonicalizeMfp` (Map/Filter/ +//! Project chains), `UnionBranchCancellation`, the structural fusions +//! (`Filter`/`Project`/`Map`/`Negate`/`Union`) and `ProjectionExtraction`, plus +//! a hand-written semantics-preserving structural rewrite. +//! +//! Generation richness: +//! +//! 
* Three column types, `int4`, `int8`, `bool`, so column refs, casts and the +//! arithmetic operators all have more than one width to mix. +//! * Richer scalars: `Add`/`Sub`/`Mul`/`Mod` per integer width, `If`/`then`/`else`, +//! `And`/`Or`/`Not`/`Eq`, and the `int4`<->`int8`/`int4`<->`bool` casts, so the +//! Map/Filter/`CanonicalizeMfp` paths see real expression trees rather than a +//! single `AddInt32`. +//! * `Union` branches that are NOT simply `x ∪ -x`: a branch is unioned with a +//! `Map`/`Filter`/`Project`-wrapped negation of an *equal* branch (so the real +//! `compare_branches` recursion through interleaved structural ops must run to +//! detect the `Inverse`), interspersed with a genuinely distinct extra branch +//! that must NOT cancel. This drives `UnionBranchCancellation`'s matching logic +//! instead of only its top-level `x ∪ -x` fast path. + +#![no_main] + +use std::collections::BTreeMap; + +use libfuzzer_sys::arbitrary::{self, Arbitrary, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_expr::{EvalError, MirRelationExpr, MirScalarExpr, func}; +use mz_repr::{Datum, Diff, ReprColumnType, ReprRelationType, ReprScalarType, Row}; +use mz_transform::canonicalization::ProjectionExtraction; +use mz_transform::canonicalize_mfp::CanonicalizeMfp; +use mz_transform::fold_constants::FoldConstants; +use mz_transform::fusion; +use mz_transform::union_cancel::UnionBranchCancellation; + +#[derive(Clone, Copy, PartialEq)] +enum Ty { + Int32, + Int64, + Bool, +} + +fn scalar_ty(ty: Ty) -> ReprScalarType { + match ty { + Ty::Int32 => ReprScalarType::Int32, + Ty::Int64 => ReprScalarType::Int64, + Ty::Bool => ReprScalarType::Bool, + } +} + +fn rand_ty(u: &mut Unstructured) -> arbitrary::Result { + Ok(match u.int_in_range(0u8..=2)? { + 0 => Ty::Int32, + 1 => Ty::Int64, + _ => Ty::Bool, + }) +} + +fn gen_datum(u: &mut Unstructured, ty: Ty) -> arbitrary::Result> { + if u.ratio(1u8, 5u8)? 
{ + return Ok(Datum::Null); + } + Ok(match ty { + Ty::Int32 => Datum::Int32(i32::arbitrary(u)?), + Ty::Int64 => Datum::Int64(i64::arbitrary(u)?), + Ty::Bool => { + if bool::arbitrary(u)? { + Datum::True + } else { + Datum::False + } + } + }) +} + +fn cols_of(schema: &[Ty], ty: Ty) -> Vec { + schema + .iter() + .enumerate() + .filter(|(_, t)| **t == ty) + .map(|(i, _)| i) + .collect() +} + +/// A well-typed scalar expression of type `ty` over a relation with column types +/// `schema`. Column references only target columns of the requested type, and +/// every leaf may also be a literal (constant, null, or a poison error to +/// exercise error-propagation paths). Includes `Add`/`Sub`/`Mul`/`Mod` per +/// integer width, the boolean connectives, `Eq` across a random type, `If`, and +/// the `int4`<->`int8`/`int4`<->`bool` casts so neither integer width is a leaf. +fn gen_scalar( + u: &mut Unstructured, + ty: Ty, + schema: &[Ty], + depth: u32, +) -> arbitrary::Result { + let st = scalar_ty(ty); + if depth == 0 || u.ratio(1u8, 2u8)? { + let cols = cols_of(schema, ty); + if !cols.is_empty() && bool::arbitrary(u)? { + let idx = u.int_in_range(0..=cols.len() - 1)?; + return Ok(MirScalarExpr::column(cols[idx])); + } + return Ok(match u.int_in_range(0u8..=2)? { + 0 => MirScalarExpr::literal_ok(gen_datum(u, ty)?, st), + 1 => MirScalarExpr::literal_null(st), + _ => MirScalarExpr::literal(Err(EvalError::DivisionByZero), st), + }); + } + let d = depth - 1; + // An `If`/`then`/`else` of the requested type, available for every `ty`. + let gen_if = |u: &mut Unstructured| -> arbitrary::Result { + let c = gen_scalar(u, Ty::Bool, schema, d)?; + let t = gen_scalar(u, ty, schema, d)?; + let f = gen_scalar(u, ty, schema, d)?; + Ok(c.if_then_else(t, f)) + }; + Ok(match ty { + Ty::Int32 => match u.int_in_range(0u8..=5)? { + 0 => gen_scalar(u, Ty::Int32, schema, d)? + .call_binary(gen_scalar(u, Ty::Int32, schema, d)?, func::AddInt32), + 1 => gen_scalar(u, Ty::Int32, schema, d)? 
+ .call_binary(gen_scalar(u, Ty::Int32, schema, d)?, func::SubInt32), + 2 => gen_scalar(u, Ty::Int32, schema, d)? + .call_binary(gen_scalar(u, Ty::Int32, schema, d)?, func::MulInt32), + 3 => gen_scalar(u, Ty::Int32, schema, d)? + .call_binary(gen_scalar(u, Ty::Int32, schema, d)?, func::ModInt32), + // Narrowing cast from int8 (may error on overflow, folds to an error). + 4 => gen_scalar(u, Ty::Int64, schema, d)?.call_unary(func::CastInt64ToInt32), + _ => gen_if(u)?, + }, + Ty::Int64 => match u.int_in_range(0u8..=5)? { + 0 => gen_scalar(u, Ty::Int64, schema, d)? + .call_binary(gen_scalar(u, Ty::Int64, schema, d)?, func::AddInt64), + 1 => gen_scalar(u, Ty::Int64, schema, d)? + .call_binary(gen_scalar(u, Ty::Int64, schema, d)?, func::SubInt64), + 2 => gen_scalar(u, Ty::Int64, schema, d)? + .call_binary(gen_scalar(u, Ty::Int64, schema, d)?, func::MulInt64), + 3 => gen_scalar(u, Ty::Int64, schema, d)? + .call_binary(gen_scalar(u, Ty::Int64, schema, d)?, func::ModInt64), + // Widening cast from int4. + 4 => gen_scalar(u, Ty::Int32, schema, d)?.call_unary(func::CastInt32ToInt64), + _ => gen_if(u)?, + }, + Ty::Bool => match u.int_in_range(0u8..=5)? { + 0 => gen_scalar(u, Ty::Bool, schema, d)?.and(gen_scalar(u, Ty::Bool, schema, d)?), + 1 => gen_scalar(u, Ty::Bool, schema, d)?.or(gen_scalar(u, Ty::Bool, schema, d)?), + 2 => gen_scalar(u, Ty::Bool, schema, d)?.not(), + 3 => { + let t = rand_ty(u)?; + let a = gen_scalar(u, t, schema, d)?; + let b = gen_scalar(u, t, schema, d)?; + a.call_binary(b, func::Eq) + } + // Cast int4 -> bool (nonzero is true). 
+ 4 => gen_scalar(u, Ty::Int32, schema, d)?.call_unary(func::CastInt32ToBool), + _ => gen_if(u)?, + }, + }) +} + +fn gen_constant(u: &mut Unstructured) -> arbitrary::Result<(MirRelationExpr, Vec)> { + let ncols = u.int_in_range(1usize..=3)?; + let schema: Vec = (0..ncols) + .map(|_| rand_ty(u)) + .collect::>()?; + let col_types: Vec = schema + .iter() + .map(|t| scalar_ty(*t).nullable(true)) + .collect(); + let nrows = u.int_in_range(0usize..=4)?; + let mut rows = Vec::with_capacity(nrows); + for _ in 0..nrows { + let mut row = Vec::with_capacity(ncols); + for t in &schema { + row.push(gen_datum(u, *t)?); + } + rows.push(row); + } + Ok(( + MirRelationExpr::constant(rows, ReprRelationType::new(col_types)), + schema, + )) +} + +fn gen_rel(u: &mut Unstructured, depth: u32) -> arbitrary::Result<(MirRelationExpr, Vec)> { + if depth == 0 || u.ratio(2u8, 5u8)? { + return gen_constant(u); + } + let (inner, schema) = gen_rel(u, depth - 1)?; + Ok(match u.int_in_range(0u8..=5)? { + // Filter: 1-2 boolean predicates over the input columns, shape unchanged. + 0 => { + let n = u.int_in_range(1usize..=2)?; + let preds = (0..n) + .map(|_| gen_scalar(u, Ty::Bool, &schema, 2)) + .collect::>>()?; + (inner.filter(preds), schema) + } + // Map: append one computed column. + 1 => { + let ty = rand_ty(u)?; + let e = gen_scalar(u, ty, &schema, 2)?; + let mut s = schema.clone(); + s.push(ty); + (inner.map(vec![e]), s) + } + // Project: pick a (possibly reordered/duplicated) subset of columns. + 2 => { + let len = schema.len(); + let k = u.int_in_range(1usize..=len)?; + let mut outputs = Vec::with_capacity(k); + for _ in 0..k { + outputs.push(u.int_in_range(0..=len - 1)?); + } + let s = outputs.iter().map(|&i| schema[i]).collect(); + (inner.project(outputs), s) + } + 3 => (inner.negate(), schema), + 4 => (inner.distinct(), schema), + // Union `inner` with a cancelling counterpart. 
Instead of the trivial + // `inner ∪ -inner`, the counterpart is `inner` wrapped in a random chain + // of Map/Filter/Project/Negate carrying an *odd* number of Negates, with + // the *same* scalars on both sides, so `UnionBranchCancellation`'s + // `compare_branches` recursion through interleaved structural ops must run + // to recognize the `Inverse`. A genuinely distinct extra branch (with a + // fresh predicate) is interspersed and must NOT cancel. The schema is + // unchanged and the cancelling pair sums to zero, so the result is just + // `distinct`'s contribution, unaffected by whether the transform fires. + _ => { + // Build a Map/Filter/Project wrapper applied identically to two clones + // of `inner`, with an extra Negate on exactly one of them so the pair + // cancels. `compare_branches` requires the structural ops to appear in + // the same order with equal arguments, which this construction + // guarantees by replaying the same recorded steps on both sides. + enum Step { + Map(MirScalarExpr), + Filter(MirScalarExpr), + Negate, + } + let n_steps = u.int_in_range(0usize..=3)?; + let mut steps = Vec::with_capacity(n_steps); + let mut wrapped_schema = schema.clone(); + for _ in 0..n_steps { + match u.int_in_range(0u8..=2)? { + 0 => { + let ty = rand_ty(u)?; + let e = gen_scalar(u, ty, &wrapped_schema, 2)?; + wrapped_schema.push(ty); + steps.push(Step::Map(e)); + } + 1 => steps.push(Step::Filter(gen_scalar(u, Ty::Bool, &wrapped_schema, 2)?)), + _ => steps.push(Step::Negate), + } + } + let replay = |mut rel: MirRelationExpr, extra_negate: bool| { + if extra_negate { + rel = rel.negate(); + } + for step in &steps { + rel = match step { + Step::Map(e) => rel.map(vec![e.clone()]), + Step::Filter(p) => rel.filter(vec![p.clone()]), + Step::Negate => rel.negate(), + }; + } + rel + }; + // The wrapped pair shares the wrapped schema. Project both back to the + // original arity so the whole union keeps `inner`'s schema. 
+ let proj: Vec = (0..schema.len()).collect(); + let left = replay(inner.clone(), false).project(proj.clone()); + let right = replay(inner.clone(), true).project(proj.clone()); + // An extra, genuinely-distinct branch that must survive cancellation: + // `inner` filtered by a fresh predicate. + let distinct_pred = gen_scalar(u, Ty::Bool, &schema, 2)?; + let extra = inner.clone().filter(vec![distinct_pred]); + // Randomize branch order so the matcher's position search is exercised + // (`.union` flattens, so this yields a single 3-input `Union`). + let [b0, b1, b2] = match u.int_in_range(0u8..=2)? { + 0 => [right, extra, left], + 1 => [extra, left, right], + _ => [left, right, extra], + }; + (b0.union(b1).union(b2), schema) + } + }) +} + +/// Fold `rel`. If it reduced to a `Constant` with `Ok` rows, return the +/// consolidated `(row, diff)` multiset, otherwise `None` (fold errored or didn't +/// fully reduce, so there is nothing to compare). +fn fold_to_multiset(mut rel: MirRelationExpr) -> Option> { + let mut typ = rel.typ(); + (FoldConstants { limit: None }) + .action(&mut rel, &mut typ) + .ok()?; + let (Ok(rows), _) = rel.as_const()? else { + return None; + }; + let mut multiset: BTreeMap = BTreeMap::new(); + for (row, diff) in rows { + *multiset.entry(row.clone()).or_insert(Diff::ZERO) += *diff; + } + multiset.retain(|_, d| *d != Diff::ZERO); + Some(multiset) +} + +/// Wrap `rel` in a transformation that preserves its `(row, diff)` multiset. +fn wrap_preserving( + u: &mut Unstructured, + rel: MirRelationExpr, + arity: usize, +) -> arbitrary::Result { + let identity = || (0..arity).collect::>(); + Ok(match u.int_in_range(0u8..=3)? 
{
+        0 => rel.project(identity()),
+        1 => rel.filter(vec![MirScalarExpr::literal_true()]),
+        2 => rel.negate().negate(),
+        _ => rel
+            .map(vec![MirScalarExpr::literal_true()])
+            .project(identity()),
+    })
+}
+
+/// Panic unless `after` has the same number of columns as `before` and the same
+/// scalar type in every column. `who` names the transform under test and `rel`
+/// is the plan printed in the failure message.
+fn assert_shape(
+    before: &ReprRelationType,
+    after: &ReprRelationType,
+    who: &str,
+    rel: &MirRelationExpr,
+) {
+    assert_eq!(
+        before.column_types.len(),
+        after.column_types.len(),
+        "{who} changed the number of columns:\n{rel:?}"
+    );
+    for (b, a) in before.column_types.iter().zip(after.column_types.iter()) {
+        assert_eq!(
+            b.scalar_type, a.scalar_type,
+            "{who} changed a column's scalar type:\n{rel:?}"
+        );
+    }
+}
+
+/// If both the baseline and the transformed plan fold to constants, require the
+/// result multisets to match.
+fn assert_same_rows(
+    baseline: &Option<BTreeMap<Row, Diff>>,
+    transformed: MirRelationExpr,
+    who: &str,
+    orig: &MirRelationExpr,
+) {
+    if let (Some(b), Some(t)) = (baseline.as_ref(), fold_to_multiset(transformed)) {
+        assert_eq!(*b, t, "{who} changed the fold result:\n{orig:?}");
+    }
+}
+
+/// Generate one plan from `u` and run every transform check against it. Each
+/// check works on its own clone and reports the original plan `rel` on failure.
+fn run(u: &mut Unstructured) -> arbitrary::Result<()> {
+    let (rel, schema) = gen_rel(u, 5)?;
+    let baseline = fold_to_multiset(rel.clone());
+
+    // A hand-written semantics-preserving structural rewrite.
+    let rewrite = wrap_preserving(u, rel.clone(), schema.len())?;
+    assert_same_rows(&baseline, rewrite, "structural rewrite", &rel);
+
+    // CanonicalizeMfp: canonicalizes Map/Filter/Project chains.
+    {
+        let mut r = rel.clone();
+        let before = r.typ();
+        if CanonicalizeMfp.action(&mut r).is_ok() {
+            assert_shape(&before, &r.typ(), "CanonicalizeMfp", &rel);
+            assert_same_rows(&baseline, r, "CanonicalizeMfp", &rel);
+        }
+    }
+
+    // UnionBranchCancellation: cancels a branch unioned with its negation.
+    {
+        let mut r = rel.clone();
+        let before = r.typ();
+        if UnionBranchCancellation.action(&mut r).is_ok() {
+            assert_shape(&before, &r.typ(), "UnionBranchCancellation", &rel);
+            assert_same_rows(&baseline, r, "UnionBranchCancellation", &rel);
+        }
+    }
+
+    // Structural fusions. Each is a purely local, semantics-preserving rewrite
+    // applied across the whole tree (pre-order, matching their real drivers).
+    // None changes the result multiset or the output shape, on any input.
+    for (who, action) in [
+        (
+            "FilterFusion",
+            fusion::filter::Filter::action as fn(&mut MirRelationExpr),
+        ),
+        ("ProjectFusion", fusion::project::Project::action),
+        ("MapFusion", fusion::map::Map::action),
+        ("NegateFusion", fusion::negate::Negate::action),
+        ("UnionFusion", fusion::union::Union::action),
+        ("ProjectionExtraction", ProjectionExtraction::action),
+    ] {
+        let mut r = rel.clone();
+        let before = r.typ();
+        r.visit_pre_mut(action);
+        assert_shape(&before, &r.typ(), who, &rel);
+        assert_same_rows(&baseline, r, who, &rel);
+    }
+
+    // FoldConstants: the evaluator itself must at least preserve shape.
+    // Fold a clone so a failure reports the original plan, like every other
+    // check above, rather than the already-folded result.
+    {
+        let mut r = rel.clone();
+        let before = r.typ();
+        let mut typ = before.clone();
+        if (FoldConstants { limit: None })
+            .action(&mut r, &mut typ)
+            .is_ok()
+        {
+            assert_shape(&before, &r.typ(), "FoldConstants", &rel);
+        }
+    }
+    Ok(())
+}
+
+fuzz_target!(|data: &[u8]| {
+    let mut u = Unstructured::new(data);
+    let _ = run(&mut u);
+});
diff --git a/src/transform/fuzz/fuzz_targets/optimizer_symbolic_equiv.rs b/src/transform/fuzz/fuzz_targets/optimizer_symbolic_equiv.rs
new file mode 100644
index 0000000000000..525d1a5410c09
--- /dev/null
+++ b/src/transform/fuzz/fuzz_targets/optimizer_symbolic_equiv.rs
@@ -0,0 +1,506 @@
+// Copyright Materialize, Inc. and contributors. All rights reserved.
+//
+// Use of this software is governed by the Business Source License
+// included in the LICENSE file.
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0. + +//! Fuzz target: the logical optimizer must preserve results over *symbolic* +//! inputs. `full_optimizer_equiv` builds plans rooted at `Constant`s, so the +//! optimizer constant-folds everything away before the interesting relational +//! planning (join ordering/implementation, predicate and projection pushdown +//! through `Get`s, key inference) ever runs. This target instead roots the plan +//! at `Get`s, opaque relations, exactly what the optimizer sees when planning a +//! real query against catalog objects, so that planning actually happens, while +//! still retaining a ground-truth oracle. +//! +//! Each `Get` is bound (in a side table) to a concrete, fuzzed constant +//! collection. The oracle: +//! +//! 1. `baseline = collapse(substitute(plan))`. Inline each `Get`'s data, then +//! fold to the actual result rows. +//! 2. `optimized = optimize(plan)`. Run the full logical optimizer with the +//! `Get`s still symbolic, so join/pushdown/key planning runs for real. +//! 3. `after = collapse(substitute(optimized))`. Inline the same data into the +//! optimized plan and fold. +//! 4. assert `baseline == after`. +//! +//! `substitute` replaces only the global `Get`s we created. The optimizer's own +//! `Let`/local `Get` bindings (e.g. from CSE) are collapsed by `collapse`, which +//! iterates `FoldConstants` + `NormalizeLets` until the plan reduces to a +//! `Constant`. The comparison is conservative (only asserted when both sides +//! fold, a `Typecheck`/optimizer error is a skip), so a surviving divergence or +//! an optimizer panic is a genuine finding. It covers the symbolic-input +//! planning that the constant-rooted target cannot reach. 
+ +#![no_main] + +use std::collections::BTreeMap; + +use libfuzzer_sys::arbitrary::{self, Arbitrary, Unstructured}; +use libfuzzer_sys::fuzz_target; +use mz_expr::{ + AggregateExpr, AggregateFunc, ColumnOrder, EvalError, Id, MirRelationExpr, MirScalarExpr, func, +}; +use mz_repr::optimize::OptimizerFeatures; +use mz_repr::{Datum, Diff, GlobalId, ReprColumnType, ReprRelationType, ReprScalarType, Row}; +use mz_transform::dataflow::DataflowMetainfo; +use mz_transform::fold_constants::FoldConstants; +use mz_transform::normalize_lets::NormalizeLets; +use mz_transform::{Optimizer, TransformCtx, typecheck}; + +#[derive(Clone, Copy, PartialEq)] +enum Ty { + Int32, + Int64, + Bool, +} + +fn scalar_ty(ty: Ty) -> ReprScalarType { + match ty { + Ty::Int32 => ReprScalarType::Int32, + Ty::Int64 => ReprScalarType::Int64, + Ty::Bool => ReprScalarType::Bool, + } +} + +fn rand_ty(u: &mut Unstructured) -> arbitrary::Result { + Ok(match u.int_in_range(0u8..=2)? { + 0 => Ty::Int32, + 1 => Ty::Int64, + _ => Ty::Bool, + }) +} + +fn gen_datum(u: &mut Unstructured, ty: Ty) -> arbitrary::Result> { + if u.ratio(1u8, 5u8)? { + return Ok(Datum::Null); + } + Ok(match ty { + Ty::Int32 => Datum::Int32(i32::arbitrary(u)?), + Ty::Int64 => Datum::Int64(i64::arbitrary(u)?), + Ty::Bool => { + if bool::arbitrary(u)? { + Datum::True + } else { + Datum::False + } + } + }) +} + +fn cols_of(schema: &[Ty], ty: Ty) -> Vec { + schema + .iter() + .enumerate() + .filter(|(_, t)| **t == ty) + .map(|(i, _)| i) + .collect() +} + +/// A shallow, well-typed scalar expression of type `ty` over `schema`. +fn gen_scalar( + u: &mut Unstructured, + ty: Ty, + schema: &[Ty], + depth: u32, +) -> arbitrary::Result { + let st = scalar_ty(ty); + if depth == 0 || u.ratio(1u8, 2u8)? { + let cols = cols_of(schema, ty); + if !cols.is_empty() && bool::arbitrary(u)? { + let idx = u.int_in_range(0..=cols.len() - 1)?; + return Ok(MirScalarExpr::column(cols[idx])); + } + return Ok(match u.int_in_range(0u8..=2)? 
{ + 0 => MirScalarExpr::literal_ok(gen_datum(u, ty)?, st), + 1 => MirScalarExpr::literal_null(st), + _ => MirScalarExpr::literal(Err(EvalError::DivisionByZero), st), + }); + } + let d = depth - 1; + Ok(match ty { + Ty::Int32 => { + let a = gen_scalar(u, Ty::Int32, schema, d)?; + let b = gen_scalar(u, Ty::Int32, schema, d)?; + match u.int_in_range(0u8..=2)? { + 0 => a.call_binary(b, func::AddInt32), + 1 => a.call_binary(b, func::SubInt32), + _ => a.call_binary(b, func::MulInt32), + } + } + Ty::Int64 => { + let a = gen_scalar(u, Ty::Int64, schema, d)?; + let b = gen_scalar(u, Ty::Int64, schema, d)?; + match u.int_in_range(0u8..=2)? { + 0 => a.call_binary(b, func::AddInt64), + 1 => a.call_binary(b, func::SubInt64), + _ => a.call_binary(b, func::MulInt64), + } + } + Ty::Bool => match u.int_in_range(0u8..=3)? { + 0 => gen_scalar(u, Ty::Bool, schema, d)?.and(gen_scalar(u, Ty::Bool, schema, d)?), + 1 => gen_scalar(u, Ty::Bool, schema, d)?.or(gen_scalar(u, Ty::Bool, schema, d)?), + 2 => gen_scalar(u, Ty::Bool, schema, d)?.not(), + _ => { + let t = rand_ty(u)?; + let a = gen_scalar(u, t, schema, d)?; + let b = gen_scalar(u, t, schema, d)?; + a.call_binary(b, func::Eq) + } + }, + }) +} + +/// A symbolic `Get` leaf bound (in `data`) to a fresh constant collection. 
+fn gen_get( + u: &mut Unstructured, + next_id: &mut u64, + data: &mut BTreeMap, +) -> arbitrary::Result<(MirRelationExpr, Vec)> { + let ncols = u.int_in_range(1usize..=3)?; + let schema: Vec = (0..ncols) + .map(|_| rand_ty(u)) + .collect::>()?; + let col_types: Vec = schema + .iter() + .map(|t| scalar_ty(*t).nullable(true)) + .collect(); + let typ = ReprRelationType::new(col_types); + let nrows = u.int_in_range(0usize..=4)?; + let mut rows = Vec::with_capacity(nrows); + for _ in 0..nrows { + let mut row = Vec::with_capacity(ncols); + for t in &schema { + row.push(gen_datum(u, *t)?); + } + rows.push(row); + } + let constant = MirRelationExpr::constant(rows, typ.clone()); + + let id = *next_id; + *next_id += 1; + data.insert(id, constant); + + Ok((MirRelationExpr::global_get(GlobalId::User(id), typ), schema)) +} + +/// One aggregate over `schema`, plus the scalar type of its output column. +fn gen_aggregate(u: &mut Unstructured, schema: &[Ty]) -> arbitrary::Result<(AggregateExpr, Ty)> { + let mut opts: Vec<(AggregateFunc, usize, Ty)> = Vec::new(); + for &c in &cols_of(schema, Ty::Int32) { + opts.push((AggregateFunc::MaxInt32, c, Ty::Int32)); + opts.push((AggregateFunc::MinInt32, c, Ty::Int32)); + opts.push((AggregateFunc::SumInt32, c, Ty::Int64)); + } + for &c in &cols_of(schema, Ty::Int64) { + opts.push((AggregateFunc::MaxInt64, c, Ty::Int64)); + opts.push((AggregateFunc::MinInt64, c, Ty::Int64)); + } + for &c in &cols_of(schema, Ty::Bool) { + opts.push((AggregateFunc::Any, c, Ty::Bool)); + opts.push((AggregateFunc::All, c, Ty::Bool)); + } + opts.push((AggregateFunc::Count, 0, Ty::Int64)); + + let idx = u.int_in_range(0..=opts.len() - 1)?; + let (func, col, out) = opts[idx].clone(); + Ok(( + AggregateExpr { + func, + expr: MirScalarExpr::column(col), + distinct: bool::arbitrary(u)?, + }, + out, + )) +} + +fn gen_rel( + u: &mut Unstructured, + depth: u32, + next_id: &mut u64, + data: &mut BTreeMap, +) -> arbitrary::Result<(MirRelationExpr, Vec)> { + if depth == 0 || 
u.ratio(2u8, 5u8)? { + return gen_get(u, next_id, data); + } + let (inner, schema) = gen_rel(u, depth - 1, next_id, data)?; + let arity = schema.len(); + Ok(match u.int_in_range(0u8..=9)? { + 0 => { + let n = u.int_in_range(1usize..=2)?; + let preds = (0..n) + .map(|_| gen_scalar(u, Ty::Bool, &schema, 2)) + .collect::>>()?; + (inner.filter(preds), schema) + } + 1 => { + let ty = rand_ty(u)?; + let e = gen_scalar(u, ty, &schema, 2)?; + let mut s = schema.clone(); + s.push(ty); + (inner.map(vec![e]), s) + } + 2 => { + let k = u.int_in_range(1usize..=arity)?; + let mut outputs = Vec::with_capacity(k); + for _ in 0..k { + outputs.push(u.int_in_range(0..=arity - 1)?); + } + let s = outputs.iter().map(|&i| schema[i]).collect(); + (inner.project(outputs), s) + } + 3 => (inner.negate(), schema), + 4 => (inner.distinct(), schema), + 5 => (inner.threshold(), schema), + 6 => { + let other = if bool::arbitrary(u)? { + inner.clone() + } else { + inner.clone().negate() + }; + (inner.union(other), schema) + } + // Join 2-4 symbolic inputs with multiple equi-join equivalence classes + // chaining inputs together (e.g. `in0.x = in1.x` and `in1.y = in2.y`). + // With the `Get`s left symbolic, this is what exercises join + // ordering/implementation selection, equality propagation across inputs, + // and predicate/projection pushdown into each `Get`. + 7 => { + let n_extra = u.int_in_range(1usize..=3)?; + let mut inputs = vec![inner]; + let mut input_schemas = vec![schema.clone()]; + for _ in 0..n_extra { + let (other, oschema) = gen_rel(u, depth - 1, next_id, data)?; + input_schemas.push(oschema); + inputs.push(other); + } + let mut variables: Vec> = Vec::new(); + for r in 1..inputs.len() { + for ty in [Ty::Int32, Ty::Int64, Ty::Bool] { + let rc = cols_of(&input_schemas[r], ty); + if rc.is_empty() || !bool::arbitrary(u)? 
{ + continue; + } + let candidates: Vec = (0..r) + .filter(|&l| !cols_of(&input_schemas[l], ty).is_empty()) + .collect(); + if candidates.is_empty() { + continue; + } + let l = candidates[u.int_in_range(0..=candidates.len() - 1)?]; + let lc = cols_of(&input_schemas[l], ty); + let li = lc[u.int_in_range(0..=lc.len() - 1)?]; + let rj = rc[u.int_in_range(0..=rc.len() - 1)?]; + variables.push(vec![(l, li), (r, rj)]); + } + } + let mut s = schema.clone(); + for os in &input_schemas[1..] { + s.extend(os.iter().copied()); + } + (MirRelationExpr::join(inputs, variables), s) + } + 8 => { + let mut group_key = Vec::new(); + for c in 0..arity { + if bool::arbitrary(u)? { + group_key.push(c); + } + } + let n_agg = u.int_in_range(0usize..=2)?; + let mut aggregates = Vec::with_capacity(n_agg); + let mut out: Vec = group_key.iter().map(|&k| schema[k]).collect(); + for _ in 0..n_agg { + let (a, t) = gen_aggregate(u, &schema)?; + aggregates.push(a); + out.push(t); + } + if group_key.is_empty() && aggregates.is_empty() { + aggregates.push(AggregateExpr { + func: AggregateFunc::Count, + expr: MirScalarExpr::column(0), + distinct: false, + }); + out.push(Ty::Int64); + } + (inner.reduce(group_key, aggregates, None), out) + } + _ => { + let mut group_key = Vec::new(); + for c in 0..arity { + if u.ratio(1u8, 3u8)? { + group_key.push(c); + } + } + // Total order (every column) so LIMIT/OFFSET is deterministic. + let mut order_key = Vec::with_capacity(arity); + for column in 0..arity { + order_key.push(ColumnOrder { + column, + desc: bool::arbitrary(u)?, + nulls_last: bool::arbitrary(u)?, + }); + } + let limit = if bool::arbitrary(u)? { + Some(MirScalarExpr::literal_ok( + Datum::Int64(u.int_in_range(0i64..=3)?), + ReprScalarType::Int64, + )) + } else { + None + }; + let offset = u.int_in_range(0usize..=2)?; + ( + inner.top_k(group_key, order_key, limit, offset, None), + schema, + ) + } + }) +} + +/// Replace every global `Get` we created with its bound constant collection. 
+/// Local `Get`s (introduced by the optimizer's `Let`s) are left for `collapse`. +fn substitute(mut rel: MirRelationExpr, data: &BTreeMap) -> MirRelationExpr { + rel.visit_pre_mut(|e| { + let replacement = match e { + MirRelationExpr::Get { + id: Id::Global(GlobalId::User(uid)), + .. + } => data.get(&*uid).cloned(), + _ => None, + }; + if let Some(c) = replacement { + *e = c; + } + }); + rel +} + +/// Outcome of trying to fold a (`Get`-free) plan all the way to a `Constant`. +enum Collapse { + /// Reduced to a `Constant` of `Ok` rows. The consolidated `(row, diff)` + /// multiset is the actual result. + Const(BTreeMap), + /// Reached a fixpoint of `FoldConstants` + `NormalizeLets` (applying them no + /// longer changes the plan) that is *not* a constant, e.g. the plan errors, + /// or folding genuinely cannot evaluate it. This is a legitimate + /// fold-limitation skip, not a coverage gap. + StuckFixpoint, + /// Hit the iteration budget without reaching either a constant or a + /// fixpoint. The plan was still simplifying when we ran out of passes. Kept + /// distinct from `StuckFixpoint` only to name the two skip reasons. + /// `FoldConstants` does not promise a constant input collapses to a + /// `Constant` within any limit, so this is a conservative skip too. + BudgetExhausted, +} + +/// Fold a (now `Get`-free) plan to a `Constant` by iterating `FoldConstants` + +/// `NormalizeLets` (to collapse any `Let`s the optimizer's CSE introduced) until +/// it either becomes a `Constant`, reaches a fixpoint, or exhausts the budget. +/// +/// This loops to a genuine fixpoint (stops only when a pass leaves the plan +/// unchanged), so a plan that just needs a few more passes converges rather than +/// being dropped. The budget is a generous guard against a non-terminating +/// rewrite. 
+fn collapse(mut rel: MirRelationExpr) -> Collapse { + let features = OptimizerFeatures::default(); + const BUDGET: usize = 64; + for _ in 0..BUDGET { + let before = rel.clone(); + let mut typ = rel.typ(); + if (FoldConstants { limit: None }) + .action(&mut rel, &mut typ) + .is_err() + { + return Collapse::StuckFixpoint; + } + if rel.as_const().is_some() { + break; + } + if NormalizeLets::new(true) + .action(&mut rel, &features) + .is_err() + { + return Collapse::StuckFixpoint; + } + // A full pass that changed nothing means we will never reach a constant. + if rel == before { + return Collapse::StuckFixpoint; + } + } + let Some(constant) = rel.as_const() else { + // Still simplifying when the budget ran out. + return Collapse::BudgetExhausted; + }; + let (Ok(rows), _) = constant else { + return Collapse::StuckFixpoint; + }; + let mut multiset: BTreeMap = BTreeMap::new(); + for (row, diff) in rows { + *multiset.entry(row.clone()).or_insert(Diff::ZERO) += *diff; + } + multiset.retain(|_, d| *d != Diff::ZERO); + Collapse::Const(multiset) +} + +/// Run the full logical optimizer. `None` if it errors (e.g. `Typecheck`). +#[allow(deprecated)] +fn optimize(rel: MirRelationExpr) -> Option { + let features = OptimizerFeatures::default(); + let typecheck_ctx = typecheck::empty_typechecking_context(); + let mut df_meta = DataflowMetainfo::default(); + let mut ctx = TransformCtx::local( + &features, + &typecheck_ctx, + &mut df_meta, + None, + Some(GlobalId::Transient(1)), + ); + let optimizer = Optimizer::logical_optimizer(&mut ctx); + optimizer + .optimize(rel, &mut ctx) + .ok() + .map(|o| o.into_inner()) +} + +fn run(u: &mut Unstructured) -> arbitrary::Result<()> { + let mut next_id = 0u64; + let mut data = BTreeMap::new(); + let (plan, _schema) = gen_rel(u, 3, &mut next_id, &mut data)?; + + // Ground truth: inline the data into the input plan and fold. 
Only proceed + // when the *input* (which has no optimizer-introduced `Let`s) folds to a + // constant, that is what gives us a result to compare against. + let baseline = match collapse(substitute(plan.clone(), &data)) { + Collapse::Const(b) => b, + Collapse::StuckFixpoint | Collapse::BudgetExhausted => return Ok(()), + }; + + // Optimize with the Gets still symbolic, then inline the same data and fold. + let Some(optimized) = optimize(plan.clone()) else { + return Ok(()); + }; + match collapse(substitute(optimized, &data)) { + Collapse::Const(after) => assert_eq!( + baseline, after, + "optimizer changed the result over symbolic inputs\nplan = {plan:?}\ndata = {data:?}" + ), + // The optimized plan did not fold to a constant: either a non-constant + // fixpoint (an operator `FoldConstants` cannot evaluate) or still + // simplifying when the 64-pass budget ran out. `FoldConstants` does not + // promise a constant input reduces to a `Constant` within a limit, and + // the optimizer legitimately reshapes plans (CSE into `Let` nesting) + // into forms this two-pass loop may not drive to a fixpoint here. Both + // are conservative skips, not divergences, matching `full_optimizer_equiv`. + Collapse::StuckFixpoint | Collapse::BudgetExhausted => {} + } + Ok(()) +} + +fuzz_target!(|data: &[u8]| { + let mut u = Unstructured::new(data); + let _ = run(&mut u); +}); diff --git a/test/cargo-fuzz/.gitignore b/test/cargo-fuzz/.gitignore new file mode 100644 index 0000000000000..2c96eb1b6517f --- /dev/null +++ b/test/cargo-fuzz/.gitignore @@ -0,0 +1,2 @@ +target/ +Cargo.lock diff --git a/test/cargo-fuzz/Cargo.toml b/test/cargo-fuzz/Cargo.toml new file mode 100644 index 0000000000000..d0f62d2ebded9 --- /dev/null +++ b/test/cargo-fuzz/Cargo.toml @@ -0,0 +1,75 @@ +# Shared workspace for every cargo-fuzz crate (`src/*/fuzz`). +# +# The fuzz crates are excluded from the root workspace because libFuzzer +# requires a nightly toolchain and would break stable builds. 
They all share +# THIS workspace rather than each being its own. A per-crate workspace has to +# duplicate the parts of the root `Cargo.toml`'s `[patch.crates-io]` it needs, +# which silently drifts out of sync with the root (e.g. a stale `iceberg-rust` +# rev) and breaks builds. Sharing one workspace means a single +# `[patch.crates-io]` to keep in sync with the root, one `Cargo.lock`, and no +# per-crate duplication. Each fuzz crate points here via +# `package.workspace = "../../../test/cargo-fuzz"`. +# +# `[patch.crates-io]` below is the SUBSET of the root `Cargo.toml`'s patches that +# the fuzz crates' dependency graph actually uses (root entries with no consumer +# here, currently `duckdb` and `postgres_array`, are omitted to avoid cargo's +# "patch was not used in the crate graph" warnings). Every entry that IS present +# must match the root's rev/version verbatim. If a fuzz crate gains a dependency +# that needs another patched crate, copy that entry from the root. + +[workspace] +resolver = "2" +members = [ + "../../src/avro/fuzz", + "../../src/catalog-protos/fuzz", + "../../src/expr/fuzz", + "../../src/interchange/fuzz", + "../../src/mysql-util/fuzz", + "../../src/persist-client/fuzz", + "../../src/pgcopy/fuzz", + "../../src/pgrepr/fuzz", + "../../src/pgtz/fuzz", + "../../src/pgwire/fuzz", + "../../src/postgres-util/fuzz", + "../../src/repr/fuzz", + "../../src/sql-parser/fuzz", + "../../src/sql-server-util/fuzz", + "../../src/storage-types/fuzz", + "../../src/storage/fuzz", + "../../src/transform/fuzz", +] + +[patch.crates-io] +# Waiting on https://github.com/sfackler/rust-postgres/pull/752. 
+postgres = { git = "https://github.com/MaterializeInc/rust-postgres" } +tokio-postgres = { git = "https://github.com/MaterializeInc/rust-postgres" } +postgres-protocol = { git = "https://github.com/MaterializeInc/rust-postgres" } +postgres-replication = { git = "https://github.com/MaterializeInc/rust-postgres" } +postgres-types = { git = "https://github.com/MaterializeInc/rust-postgres" } +postgres-openssl = { git = "https://github.com/MaterializeInc/rust-postgres" } + +# Waiting on https://github.com/MaterializeInc/serde-value/pull/35. +serde-value = { git = "https://github.com/MaterializeInc/serde-value.git" } + +# Waiting for resolution of https://github.com/launchdarkly/rust-server-sdk/issues/116 +launchdarkly-server-sdk = { git = "https://github.com/MaterializeInc/rust-server-sdk", rev = "3e0a0b98b09a2970f292577a07e1c9382b65b5da" } + +# Waiting on https://github.com/edenhill/librdkafka/pull/4051. +rdkafka = { git = "https://github.com/MaterializeInc/rust-rdkafka.git" } +rdkafka-sys = { git = "https://github.com/MaterializeInc/rust-rdkafka.git" } + +# Need to upstream a few PRs related to test builders. +# +# Note: All changes in our fork of tiberius should be pushed to the `mz_changes` branch. +tiberius = { git = "https://github.com/MaterializeInc/tiberius", rev="64ca594cc22ed67d072c2d0110455da50539e1cd" } + +# Allows us to use bzip2-sys rather than the rust reimpl. +# All changes should go to the `mz_changes` branch. +# Once https://github.com/Nullus157/async-compression/pull/364 lands we can remove this. +async-compression = { git = "https://github.com/MaterializeInc/async-compression.git", rev = "fe7411eb6104a02a89e2c3a76ab326dd6594214d" } + +# Custom iceberg features for mz +# All changes should go to the `mz_v0.9.0` branch. 
+iceberg = { git = "https://github.com/MaterializeInc/iceberg-rust.git", rev = "dedd9231ee88ee979b648e14792878b40e74c20a" } +iceberg-catalog-rest = { git = "https://github.com/MaterializeInc/iceberg-rust.git", rev = "dedd9231ee88ee979b648e14792878b40e74c20a" } +iceberg-storage-opendal = { git = "https://github.com/MaterializeInc/iceberg-rust.git", rev = "dedd9231ee88ee979b648e14792878b40e74c20a" } diff --git a/test/cargo-fuzz/mzcompose.py b/test/cargo-fuzz/mzcompose.py new file mode 100644 index 0000000000000..79b6aeec1f568 --- /dev/null +++ b/test/cargo-fuzz/mzcompose.py @@ -0,0 +1,1069 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +import base64 +import os +import platform +import shlex +import shutil +import signal +import subprocess +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import IO + +from materialize import MZ_ROOT, buildkite, ui +from materialize.mzcompose.composition import Composition, WorkflowArgumentParser + +SERVICES = [] + +# Buildkite artifact that carries the minimized corpus between release-qualification runs so +# coverage accumulates instead of restarting from the seeds every time. +CORPUS_ARTIFACT = "fuzz-corpus.tar.zst" + +# Every fuzz crate. Keep in sync with the crates that have a `fuzz/` +# subdirectory. `bin/ci-builder` and the release-qualification pipeline build the same set. 
+FUZZ_CRATES = [ + "src/sql-parser/fuzz", + "src/expr/fuzz", + "src/transform/fuzz", + "src/repr/fuzz", + "src/storage-types/fuzz", + "src/catalog-protos/fuzz", + "src/avro/fuzz", + "src/mysql-util/fuzz", + "src/postgres-util/fuzz", + "src/sql-server-util/fuzz", + "src/pgwire/fuzz", + "src/pgrepr/fuzz", + "src/pgcopy/fuzz", + "src/pgtz/fuzz", + "src/interchange/fuzz", + "src/persist-client/fuzz", + "src/storage/fuzz", +] + +# The highest-yield targets, the ones that keep surfacing bugs deep into a run, +# or that guard a bug-prone path / actively-developed subsystem where a find +# would be catastrophic. `--profile fruitful` restricts the run to these, which +# is the right focus for the long (24h) release-qualification run that should +# spend its cores where bugs still hide. Substring-matched against +# `crate::target`, like the positional `filters`. +# +# This set is pruned by productivity. Targets over well-tested, stable code that +# fuzz clean round after round (the arithmetic/range oracles, internal +# encode/decode round-trips, simple jsonb access) are dropped, since they've +# saturated, and their cores are better spent elsewhere. They still exist and can +# be run by name or re-added if the code under them changes. Two buckets stay +# despite no recent finds: the untrusted-input parsers/decoders (unbounded +# adversarial input) and the optimizer/upsert/persist targets (regression +# insurance on code that changes often). Revisit as the productive set shifts. +FRUITFUL = [ + # SQL-parser round-trip oracle, ~60% of all findings. A single target: + # structured statement generation plus a full-vocabulary soup minority, + # one `parse -> print -> reparse` AST-equality oracle. + "sql_roundtrip", + # Hand-written parsers/decoders of untrusted text/bytes (COPY, wire params, + # Kafka payloads). Every decoder bug so far came from this bucket. 
+ "strconv_parse_timestamp", + "strconv_parse_timestamptz", + "strconv_parse_date", + "strconv_parse_time", + "strconv_parse_bytes", + "strconv_parse_uuid", + "rollup_proto_roundtrip", + "copy_decode", + "protobuf_decode_fuzzed_schema", + "json_encode", + "avro_decode_fuzzed_schema", + "csv_decode", + "schema_resolve", + "avro_schema_parse", + "timezone_parse", + "like_pattern_compile", + "like_pattern_escape", + "build_regex", + "cast_string", + # Clean so far, but kept as regression insurance on actively-developed code + # where a miscompilation / corruption would be catastrophic. + "mir_scalar_reduce", + "mir_relation_transforms", + "full_optimizer_equiv", + "optimizer_symbolic_equiv", + "mfp_optimize", + "upsert_consolidate", + "upsert_value_roundtrip_v2", + "upsert_state_consolidate", + "upsert_runtime", + "row_arrow_roundtrip", +] + +say = ui.speaker("") + + +@dataclass +class Job: + crate: str # e.g. "src/avro/fuzz" + target: str # e.g. "reader_decode" + log_path: Path + proc: subprocess.Popen | None = None + started_at: float = 0.0 + finished_at: float = 0.0 + returncode: int | None = None + + @property + def name(self) -> str: + # e.g. "avro::reader_decode", short crate name + target. + crate_name = Path(self.crate).parent.name + return f"{crate_name}::{self.target}" + + @property + def artifact_dir(self) -> Path: + return MZ_ROOT / self.crate / "artifacts" / self.target + + def elapsed(self) -> float: + end = self.finished_at or time.time() + return end - self.started_at + + +@dataclass +class _CminProc: + """An in-flight `-merge=1` minimization for one target.""" + + job: Job + proc: subprocess.Popen + log: IO[str] + before: int # corpus size before the merge + merged: Path # scratch dir the merge writes into, swapped in on success + + +def fuzz_env(target_dir: Path) -> dict[str, str]: + env = dict(os.environ) + # One shared target dir for every fuzz crate (see module docstring). 
+ env["CARGO_TARGET_DIR"] = str(target_dir) + # Use every core for compilation. Force it explicitly: a `CARGO_BUILD_JOBS=1` + # inherited from the environment (handy when running many fuzzers at once) + # would otherwise throttle the build phase to a single thread. + env["CARGO_BUILD_JOBS"] = str(os.cpu_count() or 1) + # cargo-fuzz requires a nightly toolchain. Default to nightly unless the + # caller already pinned one (the nightly ci-builder flavor sets it). + env.setdefault("RUSTUP_TOOLCHAIN", "nightly") + if platform.system() == "Darwin": + # cargo-fuzz instruments every crate with SanCov coverage, but some + # dependencies build a `cdylib` (e.g. crc-fast, pulled in via + # aws-smithy-checksums). A cdylib links on its own without libFuzzer's + # coverage runtime, so it fails with undefined `__sanitizer_cov_*` + # symbols. Use the classic linker (the new one rejects these flags) and + # defer those symbols to runtime via `-undefined dynamic_lookup`, where + # the fuzz binary's own runtime supplies them. Disabling the nano malloc + # zone keeps ASan-style allocators happy. + extra = "-C link-arg=-Wl,-ld_classic -C link-arg=-Wl,-undefined,dynamic_lookup" + rustflags = env.get("RUSTFLAGS", "") + if "dynamic_lookup" not in rustflags: + env["RUSTFLAGS"] = f"{rustflags} {extra}".strip() + env.setdefault("MallocNanoZone", "0") + return env + + +def host_triple(env: dict[str, str]) -> str: + """The Rust host target triple, e.g. `x86_64-unknown-linux-gnu`. + + cargo-fuzz builds each target to `//release/`, + so we need the triple to locate the binaries we exec directly. 
+ """ + out = subprocess.run( + ["rustc", "-vV"], env=env, check=True, capture_output=True, text=True + ) + for line in out.stdout.splitlines(): + if line.startswith("host: "): + return line.removeprefix("host: ").strip() + raise ui.UIError("could not determine host target triple from `rustc -vV`") + + +def list_targets(crate: str, env: dict[str, str]) -> list[str]: + # Listing is cheap and doesn't need the corpus, so it runs for every crate. + # `prepare_corpus` is then called only for the crates this shard owns. + crate_dir = MZ_ROOT / crate + out = subprocess.run( + ["cargo", "fuzz", "list"], + cwd=crate_dir, + env=env, + check=True, + capture_output=True, + text=True, + ) + return [line.strip() for line in out.stdout.splitlines() if line.strip()] + + +def prepare_corpus(crate: str, env: dict[str, str]) -> None: + """Seed a crate's corpus by running its `prepare-corpus.sh`, if it has one.""" + crate_dir = MZ_ROOT / crate + prepare = crate_dir / "prepare-corpus.sh" + if prepare.is_file() and os.access(prepare, os.X_OK): + subprocess.run([str(prepare)], cwd=crate_dir, env=env, check=True) + + +def dict_for(job: "Job") -> str | None: + """Resolve a libFuzzer dictionary (`-dict`) for a target. + + Dictionaries inject "interesting" tokens (magic bytes, keywords, protobuf + field tags) so the mutator builds structurally-valid input faster. Lookup + order: a per-target `.dict`, then a per-crate `corpus.dict`, then a + shared `proto.dict` for the protobuf round-trip targets. 
+ """ + crate_dir = MZ_ROOT / job.crate + for candidate in (crate_dir / f"{job.target}.dict", crate_dir / "corpus.dict"): + if candidate.is_file(): + return str(candidate) + if "proto" in job.target: + shared = MZ_ROOT / "test" / "cargo-fuzz" / "proto.dict" + if shared.is_file(): + return str(shared) + return None + + +def tail(path: Path, lines: int = 1) -> str: + try: + content = path.read_text(errors="replace").splitlines() + except FileNotFoundError: + return "(no output)" + nonblank = [ln for ln in content if ln.strip()] + if not nonblank: + return "(no output)" + return "\n".join(nonblank[-lines:]) + + +# libFuzzer's end-of-run `stat::` keys, in display order, mapped to short labels. +_FINAL_STAT_LABELS = { + "number_of_executed_units": "execs", + "average_exec_per_sec": "exec/s", + "new_units_added": "new_units", + "slowest_unit_time_sec": "slowest_s", + "peak_rss_mb": "rss_mb", +} + + +def final_stats(path: Path) -> str: + """Compact one-line summary of libFuzzer's end-of-run `stat::` lines. + + libFuzzer prints e.g. `stat::peak_rss_mb: 71` for a handful of keys when a + run ends cleanly (max_total_time reached). Surface them all, not just the + last line. Fall back to the last log line if no stats were emitted. 
+ """ + try: + content = path.read_text(errors="replace") + except FileNotFoundError: + return "(no output)" + found: dict[str, str] = {} + for line in content.splitlines(): + line = line.strip() + if not line.startswith("stat::") or ":" not in line[6:]: + continue + key, _, value = line[6:].partition(":") + label = _FINAL_STAT_LABELS.get(key.strip()) + if label: + found[label] = value.strip() + if not found: + return tail(path) + return " ".join( + f"{label}={found[label]}" + for label in _FINAL_STAT_LABELS.values() + if label in found + ) + + +def _escape_bytes(data: bytes) -> str: + """Render bytes readable: printable ASCII as-is, everything else escaped.""" + special = {0x09: "\\t", 0x0A: "\\n", 0x0D: "\\r", 0x5C: "\\\\"} + out = [] + for b in data: + if b in special: + out.append(special[b]) + elif 0x20 <= b < 0x7F: + out.append(chr(b)) + else: + out.append(f"\\x{b:02x}") + return "".join(out) + + +def crash_input_lines(path: Path, max_bytes: int = 4096) -> list[str]: + """Render a crashing input for the failure annotation: an escaped, readable + form plus base64 (the authoritative, copy-pasteable form). libFuzzer doesn't + always echo the input for `deadly signal` crashes, and the artifact lives on + whichever machine ran the fuzzer, so inline it here for easy reproduction: + `echo | base64 -d > input && cargo fuzz run input`. + """ + try: + data = path.read_bytes() + except OSError: + return [] + shown = data[:max_bytes] + lines = [ + f"input ({len(data)} bytes): {_escape_bytes(shown)}", + # Base64 the full input, not just the escaped preview, so the reproduce + # recipe is exact even when the preview above is truncated. 
+ f"input base64: {base64.b64encode(data).decode('ascii')}", + ] + if len(data) > max_bytes: + lines.append( + f" (escaped preview shows the first {max_bytes} bytes; the base64 is " + f"the full input, also saved as the artifact)" + ) + return lines + + +@dataclass +class FuzzRunner: + jobs: list[Job] + env: dict[str, str] + max_seconds: int + rss_limit_mb: int + jobs_per_target: int + max_parallel: int + fail_fast: bool + triple: str = "" + # None => don't pass --sanitizer (use cargo-fuzz's default, i.e. ASan). + # The CLI defaults this to "none" (see below): our targets find panics / + # round-trip drifts, not memory-corruption bugs, so ASan adds no detection + # power here but ~2-3x slowdown. Pass --sanitizer=address to opt back in. + sanitizer: str | None = None + wall_budget: int = 0 + minimize: bool = True + # Cap the post-fuzz minimize phase in seconds (0 = unbounded) so it and the + # corpus upload after it finish inside the gap between `--wall-budget` and + # the step's hard timeout. See `_minimize`. + minimize_timeout: int = 0 + start: float = field(default_factory=time.time) + pending: list[Job] = field(default_factory=list) + running: list[Job] = field(default_factory=list) + failed: list[Job] = field(default_factory=list) + succeeded: list[Job] = field(default_factory=list) + + def _spawn(self, job: Job) -> None: + # Exec the libFuzzer binary directly rather than going through + # `cargo fuzz run`. The binary was already produced by `build()`, and + # invoking cargo again for each of the ~25 targets just makes them + # queue on cargo's per-target-dir build lock ("Blocking waiting for + # file lock"), serializing what should run in parallel. 
+ target_dir = Path(self.env["CARGO_TARGET_DIR"]) + binary = target_dir / self.triple / "release" / job.target + corpus = MZ_ROOT / job.crate / "corpus" / job.target + corpus.mkdir(parents=True, exist_ok=True) + job.artifact_dir.mkdir(parents=True, exist_ok=True) + cmd = [ + str(binary), + # Trailing slash: libFuzzer writes -. + f"-artifact_prefix={job.artifact_dir}/", + f"-rss_limit_mb={self.rss_limit_mb}", + "-print_final_stats=1", + ] + if (dict_path := dict_for(job)) is not None: + cmd.append(f"-dict={dict_path}") + if self.jobs_per_target > 1: + # libFuzzer fork mode: N worker processes for this one target. + cmd.append(f"-fork={self.jobs_per_target}") + if self.max_seconds > 0: + cmd.append(f"-max_total_time={self.max_seconds}") + cmd.append("-timeout=300") + cmd.append(str(corpus)) + log = job.log_path.open("w") + job.started_at = time.time() + # start_new_session => own process group, so Ctrl-C teardown can kill + # the libFuzzer process even if it forked workers. + job.proc = subprocess.Popen( + cmd, + cwd=MZ_ROOT / job.crate, + env=self.env, + stdout=log, + stderr=subprocess.STDOUT, + start_new_session=True, + ) + self.running.append(job) + say(f"start {job.name} (pid {job.proc.pid}, log {job.log_path})") + + def _fill_slots(self) -> None: + limit = self.max_parallel or len(self.jobs) + while self.pending and len(self.running) < limit: + self._spawn(self.pending.pop(0)) + + def _new_artifacts(self, job: Job) -> list[str]: + """Crash/oom/timeout/leak artifacts this run produced (mtime >= start). + + Checked alongside the exit code because the two cover different run + modes. A single libFuzzer process exits non-zero on a crash. Fork mode + (`-fork=N`, used only when `--jobs` resolves to more than one worker per + target) is crash-resilient instead: it writes the artifact and keeps + fuzzing, exiting 0 at the time limit. `_reap` fails the job on a non-zero + exit OR a new artifact, so neither mode reports a crash as green. 
Stale + artifacts from earlier runs are excluded by the mtime check. + """ + if not job.artifact_dir.is_dir(): + return [] + return sorted( + p.name + for p in job.artifact_dir.iterdir() + # `slow-unit-*` is deliberately excluded: libFuzzer writes it for any + # input slower than `-report_slow_units` (10s default) while exiting + # 0, so a bounded-but-slow input is not a crash and must not fail the + # run. Only genuine defect artifacts count. + if p.name.startswith(("crash-", "oom-", "timeout-", "leak-")) + and not p.name.endswith(".repro.txt") + and p.stat().st_mtime >= job.started_at + ) + + def _reap(self, job: Job) -> None: + assert job.proc is not None + job.returncode = job.proc.returncode + job.finished_at = time.time() + self.running.remove(job) + secs = int(job.elapsed()) + # A zero exit does NOT mean "no crash": fork mode writes the crash + # artifact and keeps fuzzing, exiting 0 at the time limit. Fail the job if + # it exited non-zero OR produced any new crash artifact. + if job.returncode == 0 and not self._new_artifacts(job): + self.succeeded.append(job) + say(f"✓ {job.name} [{secs}s] {final_stats(job.log_path)}") + else: + self.failed.append(job) + say(self._failure_block(job, secs)) + + def _repro_env_prefix(self) -> str: + """The `fuzz_env` vars that shape the build, as a shell `VAR=val …` + prefix. Prepended to the reproduce command so `cargo fuzz run` reuses the + binary we already built in the shared target dir instead of recompiling + into cargo-fuzz's default `/fuzz/target`. Matching CARGO_TARGET_DIR, + the toolchain, and RUSTFLAGS keeps the build fingerprint identical (the + --sanitizer flag, the other build input, is added on the command itself). 
+ """ + parts = [f"CARGO_TARGET_DIR={shlex.quote(self.env['CARGO_TARGET_DIR'])}"] + for var in ("RUSTUP_TOOLCHAIN", "RUSTFLAGS", "MallocNanoZone"): + if (val := self.env.get(var)) is not None: + parts.append(f"{var}={shlex.quote(val)}") + return " ".join(parts) + + def _failure_block(self, job: Job, secs: int) -> str: + # Emit a predictable, self-contained block. `bin/ci-annotate-errors` + # matches the START/END markers (see CARGO_FUZZ_FAILURE in + # ci_annotate_errors.py) and turns the whole block into a Buildkite + # annotation, so we deliberately do NOT call `buildkite-agent` here. + # + # Only list artifacts this run produced. Stale crash-* files from earlier + # runs would otherwise pollute every annotation. + artifacts = self._new_artifacts(job) + repro = artifacts[0] if artifacts else "" + lines = [ + "---------- CARGO-FUZZ FAILURE START ----------", + f"target: {job.name} ({job.crate})", + f"exit code: {job.returncode} (after {secs}s)", + ] + if artifacts: + lines.append(f"new artifacts: {', '.join(artifacts)}") + san = f"--sanitizer={self.sanitizer} " if self.sanitizer else "" + lines.append( + f"reproduce: cd {job.crate} && {self._repro_env_prefix()} " + f"cargo fuzz run {san}{job.target} artifacts/{job.target}/{repro}" + ) + # Inline the crashing input so it can be reproduced straight from the + # annotation, without fetching the artifact from the machine that ran it. + if artifacts: + lines += crash_input_lines(job.artifact_dir / repro) + lines.append("last output:") + lines += [f" {ln}" for ln in tail(job.log_path, 8).splitlines()] + lines.append("---------- CARGO-FUZZ FAILURE END ----------") + block = "\n".join(lines) + # Persist the same record next to the crashing artifact. stdout and the + # CI annotation are ephemeral, so without this a later triage only has + # the bare input file and must re-run the target to recover the cause. + # The `.repro.txt` sidecar keeps the artifacts dir self-describing. 
+ if artifacts: + try: + (job.artifact_dir / f"{repro}.repro.txt").write_text(block + "\n") + except OSError as e: + say(f" (could not write repro sidecar for {repro}: {e})") + return block + + def _terminate_all(self, sig: int) -> None: + for job in self.running: + if job.proc and job.proc.poll() is None: + try: + os.killpg(os.getpgid(job.proc.pid), sig) + except ProcessLookupError: + pass + + def build(self) -> None: + # Build every target up front, one crate at a time, so the concurrent + # fuzzing phase doesn't have 20+ `cargo fuzz run` invocations fighting + # over cargo's per-target-dir build lock. Crates share the target dir, + # so common dependencies compile once. + # Must match _spawn's exec: we build with this sanitizer and run that + # exact binary. Defaults to `none` everywhere (see the CLI flag): on + # macOS ASan's interceptors segfault on startup (e.g. in flockfile), and + # on Linux ASan only adds ~2-3x slowdown without catching our bug class + # (panics / round-trip drifts, not memory corruption). Pass + # --sanitizer=address to opt back into ASan's memory coverage. + cmd = ["cargo", "fuzz", "build"] + if self.sanitizer: + cmd.append(f"--sanitizer={self.sanitizer}") + crates = sorted({job.crate for job in self.jobs}) + for i, crate in enumerate(crates, 1): + say(f"building [{i}/{len(crates)}] {crate}") + if subprocess.run(cmd, cwd=MZ_ROOT / crate, env=self.env).returncode != 0: + # A fuzz target that won't compile is a broken build, not + # something to fuzz around: fail the whole run immediately rather + # than dropping the crate's targets and reporting green. + raise ui.UIError(f"build FAILED for {crate}") + + def run(self) -> list[Job]: + # Clamp the per-target budget so a slow build can't blow the step + # timeout: whatever wall time the build already consumed is subtracted. 
def _minimize(self) -> None:
    """Minimize every job's corpus in parallel, optionally under a time cap.

    Merges are started through `_cmin_spawn` and completed through
    `_cmin_finish`, at most `max_parallel` at a time (all jobs at once when
    `max_parallel` is 0). When `minimize_timeout` is positive, no new merge
    is launched past the deadline and the ones still running are killed via
    `_kill_cmins`. Ctrl-C likewise kills the running merges.
    """
    # Minimize every corpus concurrently, with the same slot cap the fuzz
    # phase uses. Each `-merge=1` is a separate process over its own corpus,
    # so running them one at a time wastes ~N cores for the whole phase.
    #
    # Minimize and the corpus upload that follows it run after the fuzz
    # phase and outside `--wall-budget`, in the gap between it and the step's
    # hard timeout. A parallel `-merge` over corpora grown for the whole run
    # can be slow, so `--minimize-timeout` caps this phase: past the deadline
    # we stop launching merges and kill the running ones, leaving the rest of
    # the gap for the upload. Killing mid-merge never loses data, since
    # `_cmin_finish` swaps the minimized corpus in only on a clean exit.
    target_dir = Path(self.env["CARGO_TARGET_DIR"])
    limit = self.max_parallel or len(self.jobs)
    # `None` means "no deadline"; every deadline check below guards on it.
    deadline = (
        time.time() + self.minimize_timeout if self.minimize_timeout > 0 else None
    )
    cap = f", ≤{self.minimize_timeout}s" if deadline else ""
    say(
        f"minimizing {len(self.jobs)} corpora in parallel "
        f"(cargo fuzz cmin, up to {limit} at once{cap})…"
    )
    pending = list(self.jobs)
    running: list[_CminProc] = []

    def fill() -> None:
        # Top the running set up to `limit`. Once the deadline has passed,
        # drop everything still queued instead of starting it.
        while pending and len(running) < limit:
            if deadline is not None and time.time() > deadline:
                pending.clear()
                return
            job = pending.pop(0)
            try:
                entry = self._cmin_spawn(job, target_dir)
            except Exception as e:  # don't let one target abort the rest
                say(f" cmin {job.name}: failed to start ({e})")
                continue
            # `_cmin_spawn` returns None when there is nothing to merge.
            if entry is not None:
                running.append(entry)

    try:
        fill()
        while running:
            if deadline is not None and time.time() > deadline:
                say(
                    f"minimize hit its {self.minimize_timeout}s cap, stopping "
                    f"(corpora left intact, {len(pending)} not minimized)"
                )
                pending.clear()
                self._kill_cmins(running)
                break
            # Iterate over a copy: finished entries are removed from `running`
            # inside the loop.
            for entry in list(running):
                if entry.proc.poll() is not None:
                    running.remove(entry)
                    try:
                        self._cmin_finish(entry)
                    except Exception as e:
                        say(f" cmin {entry.job.name}: failed ({e})")
            fill()
            time.sleep(0.5)
    except KeyboardInterrupt:
        say("Ctrl-C: stopping minimization (corpora left intact)")
        self._kill_cmins(running)
def _cmin_spawn(self, job: Job, target_dir: Path) -> _CminProc | None:
    """Start a libFuzzer merge that minimizes `job`'s corpus into a scratch dir.

    Returns the bookkeeping record for the running merge, or None when there
    is nothing to do: the fuzz binary is missing, the corpus directory does
    not exist, or the corpus is empty.

    If the log cannot be opened or the process cannot be started, the log
    handle is closed and the scratch directory removed before the exception
    propagates, so a failed start leaves nothing behind.
    """
    binary = target_dir / self.triple / "release" / job.target
    corpus = MZ_ROOT / job.crate / "corpus" / job.target
    if not binary.is_file() or not corpus.is_dir():
        return None
    before = sum(1 for _ in corpus.iterdir())
    if before == 0:
        return None
    # libFuzzer `-merge=1 <dst> <src>` copies into the (empty) dst only the
    # inputs from src that add coverage, exactly what `cargo fuzz cmin`
    # does, minus the cargo invocation. Swap it in only on success (in
    # `_cmin_finish`) so an interrupted merge never loses the corpus.
    merged = corpus.with_name(f".{job.target}.cmin")
    shutil.rmtree(merged, ignore_errors=True)
    merged.mkdir(parents=True)
    log = None
    try:
        log = job.log_path.open("a")
        proc = subprocess.Popen(
            [
                str(binary),
                "-merge=1",
                f"-rss_limit_mb={self.rss_limit_mb}",
                "-timeout=300",
                str(merged),
                str(corpus),
            ],
            cwd=MZ_ROOT / job.crate,
            env=self.env,
            stdout=log,
            stderr=subprocess.STDOUT,
            # Own process group, so Ctrl-C teardown can kill the merge.
            start_new_session=True,
        )
    except BaseException:
        # No record is returned on failure, so nobody else would close the
        # log or remove the scratch dir: clean up here, then re-raise.
        if log is not None:
            log.close()
        shutil.rmtree(merged, ignore_errors=True)
        raise
    return _CminProc(job=job, proc=proc, log=log, before=before, merged=merged)
def download_previous_corpus(env: dict[str, str]) -> None:
    """Seed corpora from the most recent earlier build that uploaded an artifact.

    Does nothing outside Buildkite. Strictly best effort: whatever goes wrong
    (missing token, no prior artifact, API hiccup) is logged and the seed
    corpora are left as they are.
    """
    if not buildkite.is_in_buildkite():
        return
    try:
        from materialize.buildkite_insights.buildkite_api import (
            artifacts_api,
            builds_api,
        )

        pipeline = os.environ["BUILDKITE_PIPELINE_SLUG"]
        branch = os.environ.get("BUILDKITE_BRANCH") or "main"
        builds = builds_api.get_builds(
            pipeline,
            max_fetches=1,
            branch=branch,
            build_states=["passed", "failed"],
            items_per_page=20,
        )

        # This shard always fuzzes the same crates, so its corpus lives on the
        # job for the same part. Parallel jobs carry a 0-based
        # `parallel_group_index`; a non-parallel run has none, so `None` is
        # what has to match in that case.
        if buildkite.get_parallelism_count() > 1:
            want_index = buildkite.get_parallelism_index()
        else:
            want_index = None

        def is_my_fuzz_job(j: dict) -> bool:
            # A cargo-fuzz job is recognized by its key, name, or label; it is
            # ours only when its parallel index matches this shard's.
            for key in ("step_key", "name", "label"):
                if "cargo-fuzz" in (j.get(key) or ""):
                    return j.get("parallel_group_index") == want_index
            return False

        for build in builds:
            job = None
            for candidate in build.get("jobs", []):
                if is_my_fuzz_job(candidate):
                    job = candidate
                    break
            if job is None:
                continue

            build_number = build["number"]
            listing = artifacts_api.get_build_job_artifact_list(
                pipeline, build_number, job["id"]
            )
            art = None
            for item in listing:
                if item.get("filename") == CORPUS_ARTIFACT:
                    art = item
                    break
            if art is None:
                continue

            dest = MZ_ROOT / CORPUS_ARTIFACT
            artifacts_api.download_artifact_to_file(
                pipeline, build_number, job["id"], art["id"], str(dest)
            )
            unpack = ["tar", "-xaf", str(dest), "-C", str(MZ_ROOT)]
            subprocess.run(unpack, env=env, check=True)
            say(f"seeded corpus from build #{build['number']}")
            return
        say("no previous corpus artifact found; starting from seeds")
    except Exception as e:
        say(f"corpus download failed (non-fatal): {e}")
def autosize_jobs(
    requested: "int | str", num_targets: int, max_parallel: int
) -> tuple[int, int]:
    """Resolve (--jobs, --max-parallel) into concrete (forks, max_parallel).

    For an explicit integer this is a no-op. For 'auto', size fork workers to
    this host's core count so every core stays busy: run all targets at once
    (or the user's --max-parallel cap) and give each ``round(cores /
    concurrent)`` forks. Rounding (rather than flooring) keeps the cores filled
    when targets don't divide evenly, at the cost of mild oversubscription that
    fork workers tolerate.

    ``concurrent`` is the number of targets that can actually run at the same
    time, so a --max-parallel larger than the number of targets is clamped to
    the target count. Otherwise the forks would be sized for slots that never
    fill and cores would sit idle. It is also floored at one so an empty
    target list cannot divide by zero.

    Crucially we launch *every* target at once even when targets outnumber
    cores (one fork each, oversubscribing the CPU) rather than running only
    `cores` of them and queueing the rest. A single run gives each target a
    long ``--max-seconds``. If the first `cores` targets each ran for that whole
    budget, the queued targets would never start before the wall clock ran out.
    Oversubscription just time-slices the cores, every target still makes
    progress, which beats some targets never running at all. A caller who
    really wants to bound concurrency can still pass --max-parallel.

    Returns a ``(forks, max_parallel)`` pair, where a ``max_parallel`` of 0
    means "launch them all".
    """
    if isinstance(requested, int):
        return requested, max_parallel
    cores = os.cpu_count() or 1
    # Effective concurrency: the cap only matters when it is below the number
    # of targets; never let it drop below one.
    if max_parallel > 0:
        concurrent = min(max_parallel, num_targets)
    else:
        concurrent = num_targets
    concurrent = max(1, concurrent)
    # round-half-up: cores/concurrent, biased to keep every core busy. Floors at
    # one fork once targets reach/exceed cores.
    forks = max(1, (cores + concurrent // 2) // concurrent)
    # The runner treats max_parallel=0 as "launch them all".
    resolved_parallel = concurrent if concurrent < num_targets else 0
    return forks, resolved_parallel
" + "--max-parallel still caps concurrent targets.", + ) + parser.add_argument( + "--max-parallel", + type=int, + default=0, + help="cap simultaneously-running targets; 0 = launch them all", + ) + parser.add_argument( + "--target-dir", + default=str(MZ_ROOT / "target"), + help="shared CARGO_TARGET_DIR for every fuzz crate (default: target/)", + ) + parser.add_argument( + "--wall-budget", + type=int, + default=0, + help="overall build+fuzz wall budget in seconds; clamps --max-seconds so " + "the build phase can't blow the step timeout (0 = no cap)", + ) + parser.add_argument( + "--fail-fast", + action="store_true", + help="stop every target as soon as one crashes", + ) + parser.add_argument( + "--no-build", + action="store_true", + help="skip the up-front sequential build phase", + ) + parser.add_argument( + "--no-minimize", + action="store_true", + help="skip the `cargo fuzz cmin` corpus minimization after fuzzing", + ) + parser.add_argument( + "--minimize-timeout", + type=int, + default=0, + help="cap post-fuzz corpus minimization at N seconds so it and the " + "corpus upload finish before the step's hard timeout; 0 = unbounded " + "(default). Set it below the step timeout minus --wall-budget.", + ) + parser.add_argument( + "--corpus-sync", + action="store_true", + help="in Buildkite, seed corpora from the last build's artifact and " + "upload the minimized corpora when done (no-op locally)", + ) + parser.add_argument( + "--profile", + choices=["all", "fruitful"], + default="all", + help="`fruitful` restricts the run to the historically high-yield " + "targets (see FRUITFUL): the SQL-parser round-trip oracles and the rich " + "hand-written PG parsers/decoders that keep finding bugs, ideal for a " + "long local run. `all` (default) runs every target. 
A `filters` list " + "narrows further within the profile.", + ) + parser.add_argument( + "filters", + nargs="*", + help="only run crate::target identifiers containing one of these substrings", + ) + args = parser.parse_args() + + target_dir = Path(args.target_dir) + env = fuzz_env(target_dir) + log_dir = target_dir / "fuzz-logs" + log_dir.mkdir(parents=True, exist_ok=True) + + jobs: list[Job] = [] + for crate in FUZZ_CRATES: + for target in list_targets(crate, env): + job = Job( + crate=crate, + target=target, + log_path=log_dir / f"{Path(crate).parent.name}_{target}.log", + ) + matches_filters = not args.filters or any( + f in job.name for f in args.filters + ) + matches_profile = args.profile == "all" or any( + f in job.name for f in FRUITFUL + ) + if matches_filters and matches_profile: + jobs.append(job) + + if not jobs: + raise ui.UIError( + f"no fuzz targets matched profile={args.profile} filters={args.filters}" + ) + + # Split the work across machines by crate (see shard_jobs_by_crate). The + # shard identity is Buildkite's standard parallelism, which `get_parallelism_*` + # read from BUILDKITE_PARALLEL_JOB[_COUNT]. Set those by hand to shard a local + # run (e.g. `BUILDKITE_PARALLEL_JOB=0 BUILDKITE_PARALLEL_JOB_COUNT=2`). + shard_index = buildkite.get_parallelism_index() + shard_count = buildkite.get_parallelism_count() + total = len(jobs) + if shard_count > 1: + jobs = shard_jobs_by_crate(jobs, shard_index, shard_count) + if not jobs: + say("this shard has no targets; nothing to do") + return + # The crates this shard owns (declaration order). Only these get their + # corpus prepared, fuzzed, and uploaded. 
+ shard_crates = [c for c in FUZZ_CRATES if any(j.crate == c for j in jobs)] + if shard_count > 1: + say( + f"shard {shard_index}/{shard_count}: {len(jobs)} of {total} targets " + f"across {len(shard_crates)} crate(s): {sorted(j.name for j in jobs)}" + ) + + jobs_per_target, max_parallel = autosize_jobs( + args.jobs, len(jobs), args.max_parallel + ) + if args.jobs == "auto": + say( + f"--jobs auto: {os.cpu_count() or 1} core(s), {len(jobs)} target(s) " + f"this shard -> -fork={jobs_per_target}" + + (f", max-parallel={max_parallel}" if max_parallel else " (all at once)") + ) + + runner = FuzzRunner( + jobs=jobs, + env=env, + max_seconds=args.max_seconds, + rss_limit_mb=args.rss_limit_mb, + jobs_per_target=jobs_per_target, + max_parallel=max_parallel, + fail_fast=args.fail_fast, + wall_budget=args.wall_budget, + minimize=not args.no_minimize, + minimize_timeout=args.minimize_timeout, + sanitizer=args.sanitizer, + triple=host_triple(env), + ) + if args.corpus_sync: + download_previous_corpus(env) + # Seed only this shard's corpora. The other crates are never fuzzed here. + for crate in shard_crates: + prepare_corpus(crate, env) + if not args.no_build: + runner.build() + failed = runner.run() + if args.corpus_sync: + # After run() (which has minimized) so we upload the lean corpus, and + # before the raise below so it persists even when a target crashed. + upload_corpus(env, shard_crates) + if failed: + raise ui.UIError( + f"{len(failed)} fuzz target(s) crashed: {[j.name for j in failed]}" + ) diff --git a/test/cargo-fuzz/proto.dict b/test/cargo-fuzz/proto.dict new file mode 100644 index 0000000000000..1857a6b94d150 --- /dev/null +++ b/test/cargo-fuzz/proto.dict @@ -0,0 +1,59 @@ +# libFuzzer dictionary for protobuf round-trip fuzz targets. +# +# Protobuf is self-describing through field tags: tag = (field_number << 3) | +# wire_type, where wire_type is 0=varint, 1=i64, 2=len-delimited, 5=i32. 
+# Seeding the mutator with single-byte tags for the low field numbers (which +# cover almost every real message) lets it assemble structurally-valid messages +# instead of bouncing off "invalid wire type" / "unexpected EOF" early-outs. + +# Field 1 +"\x08" +"\x09" +"\x0a" +"\x0d" +# Field 2 +"\x10" +"\x12" +# Field 3 +"\x18" +"\x1a" +# Field 4 +"\x20" +"\x22" +# Field 5 +"\x28" +"\x2a" +# Field 6 +"\x30" +"\x32" +# Field 7 +"\x38" +"\x3a" +# Field 8 +"\x40" +"\x42" +# Field 9 +"\x48" +"\x4a" +# Field 10 +"\x50" +"\x52" +# Fields 11-15 (varint + len-delimited) +"\x58" +"\x5a" +"\x60" +"\x62" +"\x68" +"\x6a" +"\x70" +"\x72" +"\x78" +"\x7a" +# Two-byte tag prefix for field numbers >= 16 +"\x80\x01" +"\x82\x01" +# Common small varint payloads / length prefixes +"\x00" +"\x01" +"\x02" +"\xff\xff\xff\xff\x0f" diff --git a/test/sqllogictest/error_semantics.slt b/test/sqllogictest/error_semantics.slt index 824c26a60490b..1f504c2379521 100644 --- a/test/sqllogictest/error_semantics.slt +++ b/test/sqllogictest/error_semantics.slt @@ -93,3 +93,16 @@ select coalesce(a, (select a/b from test)) from test; query error Evaluation error: division by zero select *, case when a = 5 then (select a/b from test) else 7 end from test; + +# Regression for #37049. AND/OR are non-strict: a `false`/`true` operand absorbs +# an erroring operand at runtime (`false AND ` is `false`), so `reduce` +# must not fold `x AND ` / `x OR ` to the error. Before the fix +# these spuriously errored with division by zero (test.a = 1). 
+query I +select a from test where (a = 2) and (1/0 = 1); +---- + +query I +select a from test where (a = 1) or (1/0 = 1); +---- +1 diff --git a/test/sqllogictest/temporal.slt b/test/sqllogictest/temporal.slt index bf67de30f4401..2d9cfdab8b1cf 100644 --- a/test/sqllogictest/temporal.slt +++ b/test/sqllogictest/temporal.slt @@ -440,3 +440,73 @@ FROM (SELECT CAST('62143-12-31' AS date) + '200000 YEARS' AS c2 FROM (SELECT FROM (SELECT FROM t) AS subq_0) AS subq_1 WHERE pg_catalog."current_timestamp"() = pg_catalog.pg_postmaster_start_time()) AS subq_2; + +# Regression test for https://linear.app/materializeinc/issue/CLU-137 +# +# A temporal predicate (`mz_now() < ...`) shared across both branches of a +# disjunction of conjunctions must be factored out of the disjunction, so that +# it becomes a standalone conjunct the renderer can extract as a temporal +# bound. The factoring is done by `undistribute_and_or` during predicate +# canonicalization. The error-propagation fix in #37049 had skipped +# undistribution for any expression that could error, and `mz_now()` always +# counts as could-error, so the temporal predicate stayed buried inside the OR +# and the renderer panicked with "Unsupported temporal predicate". + +# A constant collection (`VALUES`) is readable at any logical time, so the MV +# can be queried `AS OF` a small timestamp, like the `valid` MV above. 
+statement ok +CREATE VIEW clu137 (active, confirmed, enabled, label, valid_until) AS VALUES + (true, true, true, CAST(NULL AS text), 10), + (true, true, true, '', 20), + (true, true, true, 'x', 30), + (false, true, true, NULL, 40); + +statement ok +CREATE MATERIALIZED VIEW clu137_mv AS +SELECT valid_until FROM clu137 +WHERE (active AND confirmed AND enabled AND label IS NULL AND mz_now() < valid_until) + OR (active AND confirmed AND enabled AND label = '' AND mz_now() < valid_until); + +query I rowsort +SELECT * FROM clu137_mv AS OF 5; +---- +10 +20 + +query I rowsort +SELECT * FROM clu137_mv AS OF 15; +---- +20 + +query I rowsort +SELECT * FROM clu137_mv AS OF 25; +---- + +# a non-common operand that could error keeps the shared temporal predicate +# buried in the OR (factoring it out would change error semantics), so +# extraction rejects it at plan time. A clean error, not a crash. +statement ok +CREATE VIEW clu137_fallible (active, label, divisor, valid_until) AS VALUES + (true, CAST(NULL AS text), 5, 10), + (true, '', 5, 20); + +# `(100 / divisor) > 0` (can error) sits in only the second disjunct. +statement error Unsupported temporal predicate +CREATE MATERIALIZED VIEW clu137_fallible_mv AS +SELECT valid_until FROM clu137_fallible +WHERE (active AND label IS NULL AND mz_now() < valid_until) + OR (active AND label = '' AND mz_now() < valid_until AND (100 / divisor) > 0); + +# But a fallible operand common to every disjunct factors out fine: the limit is +# commonality, not fallibility. +statement ok +CREATE MATERIALIZED VIEW clu137_common_fallible_mv AS +SELECT valid_until FROM clu137_fallible +WHERE (active AND label IS NULL AND mz_now() < valid_until AND (100 / divisor) > 0) + OR (active AND label = '' AND mz_now() < valid_until AND (100 / divisor) > 0); + +query I rowsort +SELECT * FROM clu137_common_fallible_mv AS OF 5; +---- +10 +20