Skip to content

Commit 6bab74c

Browse files
committed
dt/sr: Stress test for RPC resilience to leadership transfers
Signed-off-by: Oren Leiman <oren.leiman@redpanda.com>
1 parent 467990b commit 6bab74c

1 file changed

Lines changed: 258 additions & 0 deletions

File tree

tests/rptest/tests/schema_registry_test.py

Lines changed: 258 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10791,3 +10791,261 @@ def __init__(self, context: TestContext, **kwargs: Any):
1079110791
extra_rp_conf={"schema_registry_use_rpc": True},
1079210792
**kwargs,
1079310793
)
10794+
10795+
10796+
class SchemaRegistryTransportStressTest(SchemaRegistryEndpoints):
    """
    Stress test for schema registry transport resilience. Performs
    concurrent SR read/write operations while transferring leadership of
    the _schemas topic to verify zero HTTP 500 errors surface to clients.

    Parameterized over both transport modes so behavior can be compared
    and tuned together.
    """

    def __init__(self, context: TestContext, **kwargs):
        super().__init__(
            context,
            **kwargs,
        )

    @cluster(num_nodes=3)
    def test_no_errors_during_leadership_transfers(self):
        # Local import: threading is only used by this test body.
        import threading

        admin = Admin(self.redpanda)

        # --- Setup: register initial schemas so reads have data ---
        num_subjects = 3
        schema_ids = []
        subjects = []
        for i in range(num_subjects):
            subject = f"stress-test-subject-{i}"
            subjects.append(subject)
            data = json.dumps({
                "schema":
                json.dumps({
                    "type": "record",
                    "name": f"rec{i}",
                    "fields": [{
                        "name": "f1",
                        "type": "string"
                    }],
                }),
            })
            result = self.sr_client.post_subjects_subject_versions(
                subject=subject, data=data)
            assert result.status_code == 200, (
                f"Setup: failed to register schema: {result.status_code} {result.text}"
            )
            schema_ids.append(result.json()["id"])

        self.logger.info(
            f"Setup complete: {num_subjects} subjects, schema_ids={schema_ids}"
        )

        # --- Background workers ---
        request_counter = 0
        request_counter_lock = threading.Lock()
        errors: list[str] = []
        stop_event = threading.Event()

        def count_request():
            # Lock-guarded: += on an int is not atomic across threads.
            nonlocal request_counter
            with request_counter_lock:
                request_counter += 1

        # Short timeout so threads don't block teardown.
        req_timeout = 10

        def reader_worker():
            """Continuously read subjects and schemas from random nodes."""
            while not stop_event.is_set():
                for node in self.redpanda.nodes:
                    if stop_event.is_set():
                        break
                    hostname = node.account.hostname
                    try:
                        count_request()
                        r = self.sr_client.get_subjects(
                            hostname=hostname, timeout=req_timeout)
                        if r.status_code == 500:
                            errors.append(
                                f"GET /subjects on {hostname}: 500 {r.text}")
                        for sid in schema_ids:
                            if stop_event.is_set():
                                break
                            count_request()
                            r = self.sr_client.request(
                                "GET",
                                f"schemas/ids/{sid}",
                                hostname=hostname,
                                headers=HTTP_GET_HEADERS,
                                timeout=req_timeout,
                            )
                            if r.status_code == 500:
                                errors.append(
                                    f"GET /schemas/ids/{sid} on {hostname}: "
                                    f"500 {r.text}")
                    except Exception as e:
                        # Transport-level failures (timeouts, resets) are
                        # tolerated; only HTTP 500s count against the test.
                        # NOTE: logger.warn is a deprecated alias; use warning.
                        self.logger.warning(
                            f"Reader exception on {hostname}: {e}")

        def writer_worker():
            """Continuously register new schema versions."""
            seq = 0
            while not stop_event.is_set():
                seq += 1
                subject = subjects[seq % num_subjects]
                data = json.dumps({
                    "schema":
                    json.dumps({
                        "type": "record",
                        "name": f"rec{seq % num_subjects}",
                        "fields": [
                            {
                                "name": "f1",
                                "type": ["null", "string"]
                            },
                            {
                                "name": f"f_write_{seq}",
                                "type": "string",
                                "default": "x",
                            },
                        ],
                    }),
                })
                try:
                    count_request()
                    r = self.sr_client.post_subjects_subject_versions(
                        subject=subject,
                        data=data,
                        timeout=req_timeout,
                    )
                    if r.status_code == 500:
                        errors.append(
                            f"POST /subjects/{subject}/versions: 500 {r.text}")
                except Exception as e:
                    self.logger.warning(f"Writer exception: {e}")

                # Pace writes to avoid overwhelming the cluster
                time.sleep(0.5)

        # Start 2 reader threads and 1 writer thread. daemon=True is a
        # backstop so a stuck worker can never hang the test process.
        threads = []
        for _ in range(2):
            t = threading.Thread(target=reader_worker, daemon=True)
            t.start()
            threads.append(t)
        t = threading.Thread(target=writer_worker, daemon=True)
        t.start()
        threads.append(t)

        num_transfers = 20
        try:
            # --- Perturbation: leadership transfers ---
            node_ids = [self.redpanda.node_id(n) for n in self.redpanda.nodes]
            for i in range(num_transfers):
                leader = admin.get_partition_leader(namespace="kafka",
                                                    topic="_schemas",
                                                    partition=0)
                # Pick a specific target so the same node doesn't re-elect
                # itself.
                targets = [n for n in node_ids if n != leader]
                target = targets[i % len(targets)]
                self.logger.info(
                    f"Transfer {i + 1}/{num_transfers}: moving leadership "
                    f"from node {leader} to node {target}")
                admin.partition_transfer_leadership(
                    namespace="kafka",
                    topic="_schemas",
                    partition=0,
                    target_id=target,
                )

                def leader_changed():
                    new_leader = admin.get_partition_leader(
                        namespace="kafka", topic="_schemas", partition=0)
                    return new_leader != leader

                wait_until(
                    leader_changed,
                    timeout_sec=10,
                    backoff_sec=1,
                    err_msg="Leadership did not transfer",
                )
                # Brief pause between transfers
                time.sleep(1)
        finally:
            # --- Teardown ---
            # Always stop and join the workers, even when a transfer or
            # wait_until above raises; otherwise the reader/writer threads
            # would outlive the test and keep hammering the cluster.
            stop_event.set()
            for t in threads:
                t.join(timeout=req_timeout + 5)

        alive = [t for t in threads if t.is_alive()]
        assert not alive, f"{len(alive)} worker thread(s) still alive after join"

        total_requests = request_counter
        error_rate = len(errors) / total_requests if total_requests else 0
        self.logger.info(
            f"Stress test complete: {num_transfers} leadership transfers, "
            f"{total_requests} requests, {len(errors)} errors "
            f"({error_rate:.2%})")

        # A small number of transient 500s during rapid leadership
        # transfers is acceptable — the internal retry budget can be
        # exhausted if a transfer is slow to propagate. The important
        # thing is that the error rate is low: the system recovers
        # quickly and subsequent requests succeed.
        assert error_rate < 0.05, (
            f"Error rate {error_rate:.2%} exceeds 5% threshold "
            f"({len(errors)} errors in {total_requests} requests):\n" +
            "\n".join(errors[:20]))

        # Verify the system recovers after transfers complete: every
        # node must be able to serve a basic read within a reasonable
        # window. wait_until absorbs transient CI slowness while still
        # catching real breakage.
        def all_nodes_healthy():
            for node in self.redpanda.nodes:
                r = self.sr_client.get_subjects(
                    hostname=node.account.hostname, timeout=req_timeout)
                if r.status_code != 200:
                    return False
            return True

        wait_until(
            all_nodes_healthy,
            timeout_sec=30,
            backoff_sec=2,
            err_msg="Schema registry did not recover on all nodes "
            "after leadership transfers",
        )
11029+
11030+
class SchemaRegistryRpcTransportStressTest(SchemaRegistryTransportStressTest):
    """Leadership-transfer stress test exercising the RPC transport."""

    def __init__(self, context: TestContext):
        # Force the RPC path for the schema registry's internal transport.
        conf = {"schema_registry_use_rpc": True}
        super().__init__(context, extra_rp_conf=conf)
11040+
11041+
11042+
class SchemaRegistryKafkaClientTransportStressTest(
        SchemaRegistryTransportStressTest):
    """Leadership-transfer stress test exercising the Kafka client transport."""

    def __init__(self, context: TestContext):
        # Force the legacy Kafka-client path for the internal transport.
        conf = {"schema_registry_use_rpc": False}
        super().__init__(context, extra_rp_conf=conf)

0 commit comments

Comments
 (0)