Skip to content

Commit e423964

Browse files
Add configurable cluster startup timeout and verbose output (#242)
* Add configurable cluster startup timeout and verbose output
  - Add --cluster-start-timeout CLI argument (default 40 seconds)
  - Add verbose progress output during cluster startup:
    - Show shard startup progress (Started X/Y shards)
    - Show cluster topology configuration progress
    - Show cluster wait progress with shard OK/slots agreement counts
  - Use CLUSTER ADDSLOTSRANGE instead of ADDSLOTS for efficiency
  - Add error logging for slot assignment and shard startup failures
* address PR review
* validate cluster-start-timeout at argument parsing time
* remove redundant validation from waitCluster
* fix: only pass clusterStartTimeout to ClusterEnv
1 parent 2bb4f41 commit e423964

3 files changed

Lines changed: 60 additions & 12 deletions

File tree

RLTest/__main__.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -148,6 +148,11 @@ def do_normal_conn(self, line):
148148
'--cluster_node_timeout', default=5000,
149149
help='sets the node timeout on cluster in milliseconds')
150150

151+
parser.add_argument(
152+
'--cluster-start-timeout', default=40, type=int,
153+
help='timeout in seconds to wait for cluster to be ready (default 40 seconds). '
154+
'Increase for large shard counts (e.g., 99 shards).')
155+
151156
parser.add_argument(
152157
'--cluster_credentials',
153158
help='enterprise cluster cluster_credentials "username:password", relevent only when running with cluster_existing-env')
@@ -538,6 +543,9 @@ def __init__(self):
538543
Defaults.tls_passphrase = self.args.tls_passphrase
539544
Defaults.oss_password = self.args.oss_password
540545
Defaults.cluster_node_timeout = self.args.cluster_node_timeout
546+
Defaults.cluster_start_timeout = self.args.cluster_start_timeout
547+
if Defaults.cluster_start_timeout < 5:
548+
raise Exception('--cluster-start-timeout must be at least 5 seconds')
541549
Defaults.enable_debug_command = True if self.args.allow_unsafe else self.args.enable_debug_command
542550
Defaults.enable_protected_configs = True if self.args.allow_unsafe else self.args.enable_protected_configs
543551
Defaults.enable_module_command = True if self.args.allow_unsafe else self.args.enable_module_command

RLTest/env.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -142,6 +142,7 @@ class Defaults:
142142
randomize_ports = False
143143
oss_password = None
144144
cluster_node_timeout = None
145+
cluster_start_timeout = 40
145146
curr_test_name = None
146147
port = 6379
147148
enable_debug_command = False
@@ -315,6 +316,7 @@ def getEnvByName(self):
315316
**kwargs)
316317
if self.env == 'oss-cluster':
317318
kwargs['password'] = Defaults.oss_password if self.password is None else self.password
319+
kwargs['clusterStartTimeout'] = Defaults.cluster_start_timeout
318320
return ClusterEnv(shardsCount=self.shardsCount, redisBinaryPath=self.redisBinaryPath,
319321
outputFilesFormat='%s-' + '%s-oss-cluster' % test_fname,
320322
randomizePorts=Defaults.randomize_ports,
@@ -369,7 +371,7 @@ def getEnvKwargs(self):
369371
'terminateRetries': self.terminateRetries,
370372
'terminateRetrySecs': self.terminateRetrySecs,
371373
'redisConfigFile': self.redisConfigFile,
372-
'dualTLS': self.dualTLS
374+
'dualTLS': self.dualTLS,
373375
}
374376
return kwargs
375377

RLTest/redis_cluster.py

Lines changed: 49 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,9 @@
66
import time
77
from RLTest.utils import Colors
88

9+
# Interval in seconds between status updates during cluster wait
10+
CLUSTER_STATUS_INTERVAL_SEC = 5
11+
912

1013
class ClusterEnv(object):
1114
def __init__(self, **kwargs):
@@ -23,6 +26,7 @@ def __init__(self, **kwargs):
2326
self.protocol = kwargs.get('protocol', 2)
2427
self.terminateRetries = kwargs.get('terminateRetries', None)
2528
self.terminateRetrySecs = kwargs.get('terminateRetrySecs', None)
29+
self.clusterStartTimeout = kwargs.pop('clusterStartTimeout', 40)
2630
startPort = kwargs.pop('port', 10000)
2731
totalRedises = self.shardsCount * (2 if useSlaves else 1)
2832
randomizePorts = kwargs.pop('randomizePorts', False)
@@ -50,7 +54,8 @@ def getInformationBeforeDispose(self):
5054
def getInformationAfterDispose(self):
5155
return [shard.getInformationAfterDispose() for shard in self.shards]
5256

53-
def _agreeOk(self):
57+
def _countOk(self):
58+
"""Returns count of shards reporting cluster_state:ok"""
5459
ok = 0
5560
for shard in self.shards:
5661
con = shard.getConnection()
@@ -61,9 +66,10 @@ def _agreeOk(self):
6166
continue
6267
if 'cluster_state:ok' in str(status):
6368
ok += 1
64-
return ok == len(self.shards)
69+
return ok
6570

66-
def _agreeSlots(self):
71+
def _countAgreeSlots(self):
72+
"""Returns count of shards that agree on slots view"""
6773
ok = 0
6874
first_view = None
6975
for shard in self.shards:
@@ -77,35 +83,63 @@ def _agreeSlots(self):
7783
first_view = slots_view
7884
if slots_view == first_view:
7985
ok += 1
80-
return ok == len(self.shards)
86+
return ok
8187

82-
def waitCluster(self, timeout_sec=40):
88+
def waitCluster(self, timeout_sec=40, verbose=True):
8389
st = time.time()
90+
last_status_time = st
91+
total_shards = len(self.shards)
92+
93+
if verbose:
94+
print(Colors.Yellow('Waiting for cluster to be ready (timeout: %d seconds, %d shards)...' %
95+
(timeout_sec, total_shards)))
8496

8597
while st + timeout_sec > time.time():
86-
if self._agreeOk() and self._agreeSlots():
98+
ok_count = self._countOk()
99+
slots_count = self._countAgreeSlots()
100+
101+
if ok_count == total_shards and slots_count == total_shards:
102+
elapsed = time.time() - st
103+
if verbose:
104+
print(Colors.Green('Cluster is ready after %.1f seconds' % elapsed))
87105
for shard in self.shards:
88106
try:
89107
shard.getConnection().execute_command('SEARCH.CLUSTERREFRESH')
90108
except Exception:
91109
pass
92110
return
93111

112+
# Print periodic status update
113+
now = time.time()
114+
if verbose and (now - last_status_time) >= CLUSTER_STATUS_INTERVAL_SEC:
115+
elapsed = now - st
116+
print(Colors.Yellow(' Cluster wait: %.1fs elapsed - %d/%d shards OK, %d/%d agree on slots...' %
117+
(elapsed, ok_count, total_shards, slots_count, total_shards)))
118+
last_status_time = now
119+
94120
time.sleep(0.1)
95121
raise RuntimeError(
96122
"Cluster OK wait loop timed out after %s seconds" % timeout_sec)
97123

98124
def startEnv(self, masters=True, slaves=True):
99125
if self.envIsUp == True:
100126
return # env is already up
127+
128+
total_shards = len(self.shards)
129+
print(Colors.Yellow('Starting cluster with %d shards...' % total_shards))
130+
101131
try:
102-
for shard in self.shards:
132+
for i, shard in enumerate(self.shards):
103133
shard.startEnv(masters, slaves)
104-
except Exception:
134+
print(Colors.Yellow(' Started shard %d/%d' % (i + 1, total_shards)))
135+
except Exception as e:
136+
print(Colors.Bred('Error starting shard %d: %s' % (i + 1, str(e))))
137+
print(Colors.Bred('Stopping all shards...'))
105138
for shard in self.shards:
106139
shard.stopEnv()
107140
raise
108141

142+
print(Colors.Yellow('Configuring cluster topology...'))
109143
slots_per_node = int(16384 / len(self.shards)) + 1
110144
for i, shard in enumerate(self.shards):
111145
con = shard.getConnection()
@@ -121,10 +155,14 @@ def startEnv(self, masters=True, slaves=True):
121155
try:
122156
con.execute_command('CLUSTER', 'ADDSLOTS', *(str(x)
123157
for x in range(start_slot, end_slot)))
124-
except Exception:
125-
pass
158+
except Exception as e:
159+
print(Colors.Bred(' Error assigning slots %d-%d to shard %d: %s' %
160+
(start_slot, end_slot - 1, i + 1, str(e))))
126161

127-
self.waitCluster()
162+
print(Colors.Yellow(' Configured shard %d/%d (slots %d-%d)' %
163+
(i + 1, total_shards, start_slot, min(end_slot - 1, 16383))))
164+
165+
self.waitCluster(timeout_sec=self.clusterStartTimeout)
128166
self.envIsUp = True
129167
self.envIsHealthy = True
130168

0 commit comments

Comments
 (0)