@@ -10791,3 +10791,261 @@ def __init__(self, context: TestContext, **kwargs: Any):
1079110791 extra_rp_conf = {"schema_registry_use_rpc" : True },
1079210792 ** kwargs ,
1079310793 )
10794+
10795+
class SchemaRegistryTransportStressTest(SchemaRegistryEndpoints):
    """
    Stress test for schema registry transport resilience.

    Performs concurrent SR read/write operations while transferring
    leadership of the _schemas topic to verify that (nearly) zero HTTP
    500 errors surface to clients.

    Parameterized over both transport modes (see the subclasses) so
    behavior can be compared and tuned together.
    """

    def __init__(self, context: TestContext, **kwargs: Any):
        super().__init__(
            context,
            **kwargs,
        )

    @cluster(num_nodes=3)
    def test_no_errors_during_leadership_transfers(self):
        # Local import: only this test needs threading and the module's
        # import block is outside this change.
        import threading

        admin = Admin(self.redpanda)

        # --- Setup: register initial schemas so reads have data ---
        num_subjects = 3
        schema_ids = []
        subjects = []
        for i in range(num_subjects):
            subject = f"stress-test-subject-{i}"
            subjects.append(subject)
            data = json.dumps({
                "schema":
                json.dumps({
                    "type": "record",
                    "name": f"rec{i}",
                    "fields": [{
                        "name": "f1",
                        "type": "string"
                    }],
                }),
            })
            result = self.sr_client.post_subjects_subject_versions(
                subject=subject, data=data)
            assert result.status_code == 200, (
                f"Setup: failed to register schema: "
                f"{result.status_code} {result.text}")
            schema_ids.append(result.json()["id"])

        self.logger.info(
            f"Setup complete: {num_subjects} subjects, schema_ids={schema_ids}"
        )

        # --- Background workers ---
        request_counter = 0
        request_counter_lock = threading.Lock()
        errors: list[str] = []
        stop_event = threading.Event()

        def count_request():
            # The counter is incremented from multiple worker threads,
            # so protect it with a lock.
            nonlocal request_counter
            with request_counter_lock:
                request_counter += 1

        # Short timeout so threads don't block teardown.
        req_timeout = 10

        def reader_worker():
            """Continuously read subjects and schemas from every node."""
            while not stop_event.is_set():
                for node in self.redpanda.nodes:
                    if stop_event.is_set():
                        break
                    hostname = node.account.hostname
                    try:
                        count_request()
                        r = self.sr_client.get_subjects(
                            hostname=hostname, timeout=req_timeout)
                        if r.status_code == 500:
                            errors.append(
                                f"GET /subjects on {hostname}: 500 {r.text}")
                        for sid in schema_ids:
                            if stop_event.is_set():
                                break
                            count_request()
                            r = self.sr_client.request(
                                "GET",
                                f"schemas/ids/{sid}",
                                hostname=hostname,
                                headers=HTTP_GET_HEADERS,
                                timeout=req_timeout,
                            )
                            if r.status_code == 500:
                                errors.append(
                                    f"GET /schemas/ids/{sid} on {hostname}: "
                                    f"500 {r.text}")
                    except Exception as e:
                        # Connection errors/timeouts during a transfer
                        # are tolerated; only 500s count against the
                        # error budget. (Logger.warn is a deprecated
                        # alias for warning.)
                        self.logger.warning(
                            f"Reader exception on {hostname}: {e}")

        def writer_worker():
            """Continuously register new schema versions."""
            seq = 0
            while not stop_event.is_set():
                seq += 1
                subject = subjects[seq % num_subjects]
                # Each version adds a new defaulted field so it stays
                # compatible with previously registered versions.
                data = json.dumps({
                    "schema":
                    json.dumps({
                        "type": "record",
                        "name": f"rec{seq % num_subjects}",
                        "fields": [
                            {
                                "name": "f1",
                                "type": ["null", "string"]
                            },
                            {
                                "name": f"f_write_{seq}",
                                "type": "string",
                                "default": "x",
                            },
                        ],
                    }),
                })
                try:
                    count_request()
                    r = self.sr_client.post_subjects_subject_versions(
                        subject=subject,
                        data=data,
                        timeout=req_timeout,
                    )
                    if r.status_code == 500:
                        errors.append(
                            f"POST /subjects/{subject}/versions: 500 {r.text}")
                except Exception as e:
                    self.logger.warning(f"Writer exception: {e}")

                # Pace writes to avoid overwhelming the cluster
                time.sleep(0.5)

        # Start 2 reader threads and 1 writer thread
        threads = []
        for _ in range(2):
            t = threading.Thread(target=reader_worker)
            t.start()
            threads.append(t)
        t = threading.Thread(target=writer_worker)
        t.start()
        threads.append(t)

        # --- Perturbation: leadership transfers ---
        num_transfers = 20
        node_ids = [self.redpanda.node_id(n) for n in self.redpanda.nodes]
        for i in range(num_transfers):
            leader = admin.get_partition_leader(namespace="kafka",
                                                topic="_schemas",
                                                partition=0)
            # Pick a specific target so the same node doesn't re-elect
            # itself.
            targets = [n for n in node_ids if n != leader]
            target = targets[i % len(targets)]
            self.logger.info(
                f"Transfer {i + 1}/{num_transfers}: moving leadership "
                f"from node {leader} to node {target}")
            admin.partition_transfer_leadership(
                namespace="kafka",
                topic="_schemas",
                partition=0,
                target_id=target,
            )

            def leader_changed():
                new_leader = admin.get_partition_leader(namespace="kafka",
                                                        topic="_schemas",
                                                        partition=0)
                return new_leader != leader

            wait_until(
                leader_changed,
                timeout_sec=10,
                backoff_sec=1,
                err_msg="Leadership did not transfer",
            )
            # Brief pause between transfers
            time.sleep(1)

        # --- Teardown ---
        stop_event.set()
        for t in threads:
            t.join(timeout=req_timeout + 5)
        alive = [t for t in threads if t.is_alive()]
        assert not alive, f"{len(alive)} worker thread(s) still alive after join"

        # Safe to read without the lock: all worker threads are joined.
        total_requests = request_counter
        error_rate = len(errors) / total_requests if total_requests else 0
        self.logger.info(
            f"Stress test complete: {num_transfers} leadership transfers, "
            f"{total_requests} requests, {len(errors)} errors "
            f"({error_rate:.2%})")

        # A small number of transient 500s during rapid leadership
        # transfers is acceptable — the internal retry budget can be
        # exhausted if a transfer is slow to propagate. The important
        # thing is that the error rate is low: the system recovers
        # quickly and subsequent requests succeed.
        assert error_rate < 0.05, (
            f"Error rate {error_rate:.2%} exceeds 5% threshold "
            f"({len(errors)} errors in {total_requests} requests):\n" +
            "\n".join(errors[:20]))

        # Verify the system recovers after transfers complete: every
        # node must be able to serve a basic read within a reasonable
        # window. wait_until absorbs transient CI slowness while still
        # catching real breakage.
        def all_nodes_healthy():
            for node in self.redpanda.nodes:
                r = self.sr_client.get_subjects(
                    hostname=node.account.hostname, timeout=req_timeout)
                if r.status_code != 200:
                    return False
            return True

        wait_until(
            all_nodes_healthy,
            timeout_sec=30,
            backoff_sec=2,
            err_msg="Schema registry did not recover on all nodes "
            "after leadership transfers",
        )
11029+
class SchemaRegistryRpcTransportStressTest(SchemaRegistryTransportStressTest):
    """
    RPC transport variant of the leadership transfer stress test
    (schema_registry_use_rpc=True).
    """

    def __init__(self, context: TestContext, **kwargs: Any):
        # Forward **kwargs so harness-supplied options still reach the
        # base class, matching the sibling test classes in this file.
        super().__init__(
            context,
            extra_rp_conf={"schema_registry_use_rpc": True},
            **kwargs,
        )
11040+
11041+
class SchemaRegistryKafkaClientTransportStressTest(SchemaRegistryTransportStressTest):
    """
    Kafka client transport variant of the leadership transfer stress
    test (schema_registry_use_rpc=False).
    """

    def __init__(self, context: TestContext, **kwargs: Any):
        # Forward **kwargs so harness-supplied options still reach the
        # base class, matching the sibling test classes in this file.
        super().__init__(
            context,
            extra_rp_conf={"schema_registry_use_rpc": False},
            **kwargs,
        )