From f978f441d32f11ab4960667684cf5f038edf6a7b Mon Sep 17 00:00:00 2001 From: Travis Downs Date: Wed, 13 May 2026 17:54:36 -0400 Subject: [PATCH] kafka/protocol: make schemata codegen reproducible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The generator iterated Python sets when emitting #include lines, so the output of each codegen action varied with PYTHONHASHSEED (which CPython picks fresh per interpreter). The C++ files were preprocessor-equivalent across runs, so the final binary was unchanged, but Bazel keys its action cache on input content hashes — non-deterministic codegen output invalidates every downstream compile's cache key. On a shared remote cache this means every developer compiling the kafka layer misses on every action that consumes the generated headers. Sort the header sets at the two sites where they're iterated by Jinja: StructType.headers() (per-schema header template) and the extra_schema_headers passed into COMBINED_SOURCE_TEMPLATE (per-schema source template). Add a small py_test that runs the generator with two very different PYTHONHASHSEED values across a handful of representative schemata and fails if any output byte differs. 
--- src/v/kafka/protocol/schemata/BUILD | 21 ++++- src/v/kafka/protocol/schemata/generator.py | 4 +- .../generator_reproducibility_test.py | 79 +++++++++++++++++++ 3 files changed, 101 insertions(+), 3 deletions(-) create mode 100644 src/v/kafka/protocol/schemata/generator_reproducibility_test.py diff --git a/src/v/kafka/protocol/schemata/BUILD b/src/v/kafka/protocol/schemata/BUILD index 2ecaf0eaa31d2..cd3d7531ef023 100644 --- a/src/v/kafka/protocol/schemata/BUILD +++ b/src/v/kafka/protocol/schemata/BUILD @@ -1,4 +1,4 @@ -load("@rules_python//python:defs.bzl", "py_binary") +load("@rules_python//python:defs.bzl", "py_binary", "py_test") load("//src/v/kafka/protocol/schemata:generator.bzl", "generate_kafka_messages") py_binary( @@ -12,4 +12,23 @@ py_binary( ], ) +py_test( + name = "generator_reproducibility_test", + size = "small", + srcs = ["generator_reproducibility_test.py"], + data = [ + "create_topics_request.json", + "create_topics_response.json", + "describe_configs_response.json", + "fetch_request.json", + "fetch_response.json", + "generator.py", + "metadata_response.json", + ], + deps = [ + "@python_deps//jinja2", + "@python_deps//jsonschema", + ], +) + generate_kafka_messages() diff --git a/src/v/kafka/protocol/schemata/generator.py b/src/v/kafka/protocol/schemata/generator.py index c880e4752fff0..36338b82529c8 100755 --- a/src/v/kafka/protocol/schemata/generator.py +++ b/src/v/kafka/protocol/schemata/generator.py @@ -995,7 +995,7 @@ def type_headers(t): h = h.get(which, ()) yield from maybe_strings(h) - return set(h for t in types for h in type_headers(t)) + return sorted({h for t in types for h in type_headers(t)}) @property def is_default_comparable(self): @@ -2171,7 +2171,7 @@ def fail(msg): src = jinja2.Template(COMBINED_SOURCE_TEMPLATE).render( schema_headers=map(lambda p: p.name, headers), - extra_headers=extra_schema_headers, + extra_headers=sorted(extra_schema_headers), sources=sources, ) diff --git 
a/src/v/kafka/protocol/schemata/generator_reproducibility_test.py b/src/v/kafka/protocol/schemata/generator_reproducibility_test.py new file mode 100644 index 0000000000000..c3c5c42a6d745 --- /dev/null +++ b/src/v/kafka/protocol/schemata/generator_reproducibility_test.py @@ -0,0 +1,79 @@ +# Copyright 2026 Redpanda Data, Inc. +# +# Use of this software is governed by the Business Source License +# included in the file licenses/BSL.md +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0 +"""Hermeticity regression test for the kafka schemata codegen. + +The generator emits C++ source from JSON schemas. Earlier versions iterated +Python sets when emitting #include lines, so the byte-level output varied with +PYTHONHASHSEED (which CPython picks fresh per interpreter). Bazel keys its +action cache on input content hashes, so non-deterministic codegen invalidates +the cache for every downstream compile even when the result is +preprocessor-equivalent. + +This test runs the generator with two PYTHONHASHSEED values and fails if +any output byte differs. +""" + +import os +import subprocess +import sys +import tempfile +import unittest +from pathlib import Path + +_HERE = Path(__file__).resolve().parent +_GENERATOR = _HERE / "generator.py" + +_SEEDS = ("1", "4294967295") +_EXTS = ("h", "cc") + +# Schemata chosen to exercise the codegen paths most likely to expose set-order +# bugs: ones whose fields pull in multiple `extra_headers` entries. 
+_SCHEMATA = ( + "create_topics_request", + "create_topics_response", + "fetch_request", + "fetch_response", + "metadata_response", + "describe_configs_response", +) + + +def _run_generator(schema: str, seed: str, outdir: Path) -> dict[str, bytes]: + """Run the generator once and return {ext: bytes} for the produced files.""" + outdir.mkdir() + subprocess.run( + [ + sys.executable, + str(_GENERATOR), + str(_HERE / f"{schema}.json"), + *(str(outdir / f"{schema}.{ext}") for ext in _EXTS), + ], + check=True, + env={**os.environ, "PYTHONHASHSEED": seed}, + ) + return {ext: (outdir / f"{schema}.{ext}").read_bytes() for ext in _EXTS} + + +class GeneratorReproducibilityTest(unittest.TestCase): + def test_codegen_is_hash_seed_independent(self) -> None: + for schema in _SCHEMATA: + with self.subTest(schema=schema), tempfile.TemporaryDirectory() as tmp: + outputs = [ + _run_generator(schema, s, Path(tmp) / f"seed{s}") for s in _SEEDS + ] + for ext in _EXTS: + self.assertEqual( + {o[ext] for o in outputs}, + {outputs[0][ext]}, + f"{schema}.{ext} varies across PYTHONHASHSEED values", + ) + + +if __name__ == "__main__": + unittest.main()