From f978f441d32f11ab4960667684cf5f038edf6a7b Mon Sep 17 00:00:00 2001
From: Travis Downs <travis.downs@redpanda.com>
Date: Wed, 13 May 2026 17:54:36 -0400
Subject: [PATCH] kafka/protocol: make schemata codegen reproducible
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The generator iterated Python sets when emitting #include lines, so the
output of each codegen action varied with the string hash seed, which
CPython randomizes for every interpreter process unless PYTHONHASHSEED is
set. The C++ files were preprocessor-equivalent across runs, so the final
binary was unchanged, but Bazel keys its action cache on input content
hashes — non-deterministic codegen output invalidates every downstream
compile's cache key. On a shared remote cache this means every developer
compiling the kafka layer misses on every action that consumes the
generated headers.

Sort the header sets at the two sites where they're iterated by Jinja:
StructType.headers() (per-schema header template) and the
extra_schema_headers passed into COMBINED_SOURCE_TEMPLATE (per-schema
source template).

Add a small py_test that runs the generator with two very different
PYTHONHASHSEED values across a handful of representative schemata and
fails if any output byte differs.
---
 src/v/kafka/protocol/schemata/BUILD           | 21 ++++-
 src/v/kafka/protocol/schemata/generator.py    |  4 +-
 .../generator_reproducibility_test.py         | 79 +++++++++++++++++++
 3 files changed, 101 insertions(+), 3 deletions(-)
 create mode 100644 src/v/kafka/protocol/schemata/generator_reproducibility_test.py

diff --git a/src/v/kafka/protocol/schemata/BUILD b/src/v/kafka/protocol/schemata/BUILD
index 2ecaf0eaa31d2..cd3d7531ef023 100644
--- a/src/v/kafka/protocol/schemata/BUILD
+++ b/src/v/kafka/protocol/schemata/BUILD
@@ -1,4 +1,4 @@
-load("@rules_python//python:defs.bzl", "py_binary")
+load("@rules_python//python:defs.bzl", "py_binary", "py_test")
 load("//src/v/kafka/protocol/schemata:generator.bzl", "generate_kafka_messages")
 
 py_binary(
@@ -12,4 +12,23 @@ py_binary(
     ],
 )
 
+py_test(
+    name = "generator_reproducibility_test",
+    size = "small",
+    srcs = ["generator_reproducibility_test.py"],
+    data = [  # runtime inputs: the schemas under test plus the generator script
+        "create_topics_request.json",
+        "create_topics_response.json",
+        "describe_configs_response.json",
+        "fetch_request.json",
+        "fetch_response.json",
+        "generator.py",  # launched as a subprocess by the test, not imported
+        "metadata_response.json",
+    ],
+    deps = [  # not imported by the test itself; presumably needed by generator.py
+        "@python_deps//jinja2",
+        "@python_deps//jsonschema",
+    ],
+)
+
 generate_kafka_messages()
diff --git a/src/v/kafka/protocol/schemata/generator.py b/src/v/kafka/protocol/schemata/generator.py
index c880e4752fff0..36338b82529c8 100755
--- a/src/v/kafka/protocol/schemata/generator.py
+++ b/src/v/kafka/protocol/schemata/generator.py
@@ -995,7 +995,7 @@ def type_headers(t):
             h = h.get(which, ())
             yield from maybe_strings(h)
 
-        return set(h for t in types for h in type_headers(t))
+        return sorted({h for t in types for h in type_headers(t)})
 
     @property
     def is_default_comparable(self):
@@ -2171,7 +2171,7 @@ def fail(msg):
 
     src = jinja2.Template(COMBINED_SOURCE_TEMPLATE).render(
         schema_headers=map(lambda p: p.name, headers),
-        extra_headers=extra_schema_headers,
+        extra_headers=sorted(extra_schema_headers),
         sources=sources,
     )
 
diff --git a/src/v/kafka/protocol/schemata/generator_reproducibility_test.py b/src/v/kafka/protocol/schemata/generator_reproducibility_test.py
new file mode 100644
index 0000000000000..c3c5c42a6d745
--- /dev/null
+++ b/src/v/kafka/protocol/schemata/generator_reproducibility_test.py
@@ -0,0 +1,79 @@
+# Copyright 2026 Redpanda Data, Inc.
+#
+# Use of this software is governed by the Business Source License
+# included in the file licenses/BSL.md
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0
+"""Hermeticity regression test for the kafka schemata codegen.
+
+The generator emits C++ source from JSON schemas. Earlier versions iterated
+Python sets when emitting #include lines, so the byte-level output varied with
+the string hash seed (randomized per CPython process unless PYTHONHASHSEED is
+set). Bazel keys its action cache on input content hashes, so non-deterministic
+codegen invalidates the cache for every downstream compile even when the
+result is preprocessor-equivalent.
+
+This test runs the generator on each schema in _SCHEMATA once per
+PYTHONHASHSEED value in _SEEDS and fails if any output byte differs.
+"""
+
+import os
+import subprocess
+import sys
+import tempfile
+import unittest
+from pathlib import Path
+
+_HERE = Path(__file__).resolve().parent  # this file's directory, symlinks resolved
+_GENERATOR = _HERE / "generator.py"  # script launched once per (schema, seed)
+
+_SEEDS = ("1", "4294967295")  # PYTHONHASHSEED values; the second is the maximum
+_EXTS = ("h", "cc")  # suffixes of the two output files requested per run
+
+# Schemata chosen to exercise the codegen paths most likely to expose set-order
+# bugs: ones whose fields pull in multiple `extra_headers` entries.
+_SCHEMATA = (
+    "create_topics_request",
+    "create_topics_response",
+    "fetch_request",
+    "fetch_response",
+    "metadata_response",
+    "describe_configs_response",
+)
+
+
+def _run_generator(schema: str, seed: str, outdir: Path) -> dict[str, bytes]:
+    """Run the generator under hash seed `seed`; return output bytes by ext."""
+    outdir.mkdir()
+
+    # One output file per extension, listed on the command line in _EXTS order.
+    produced = {ext: outdir / f"{schema}.{ext}" for ext in _EXTS}
+    argv = [sys.executable, str(_GENERATOR), str(_HERE / f"{schema}.json")]
+    argv += [str(path) for path in produced.values()]
+
+    # Inherit the caller's environment, overriding only the hash seed.
+    child_env = dict(os.environ)
+    child_env["PYTHONHASHSEED"] = seed
+    subprocess.run(argv, check=True, env=child_env)
+    return {ext: path.read_bytes() for ext, path in produced.items()}
+
+
+class GeneratorReproducibilityTest(unittest.TestCase):
+    """Checks generated files are byte-identical under every seed in _SEEDS."""
+
+    def test_codegen_is_hash_seed_independent(self) -> None:
+        for schema in _SCHEMATA:
+            with self.subTest(schema=schema), tempfile.TemporaryDirectory() as tmp:
+                root = Path(tmp)
+                outputs = [_run_generator(schema, s, root / f"seed{s}") for s in _SEEDS]
+                for ext in _EXTS:
+                    distinct = {run[ext] for run in outputs}
+                    baseline = {outputs[0][ext]}
+                    msg = f"{schema}.{ext} varies across PYTHONHASHSEED values"
+                    self.assertEqual(distinct, baseline, msg)
+
+
+if __name__ == "__main__":
+    unittest.main()  # discover and run the TestCase above when run as a script