Skip to content

Commit 06a5f45

Browse files
committed
feat(DISET): auto-restart service after configurable max throttle duration
When a service is stuck in throttle mode (all threads blocked, queue full), it cannot recover without external intervention. Add a configurable MaxThrottleDuration option in the Configuration System (CS) (default: 0 = disabled) that triggers a process exit after the specified number of seconds of continuous throttling. The process supervisor (e.g. runsv) then restarts the service cleanly, clearing all stuck state. When the option is enabled and the limit is exceeded, a FATAL log message with full queue/thread diagnostics is emitted before exit for post-mortem analysis.
1 parent 0bcc9e1 commit 06a5f45

File tree

2 files changed

+29
-2
lines changed

2 files changed

+29
-2
lines changed

src/DIRAC/Core/DISET/ServiceReactor.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -244,10 +244,25 @@ def __acceptIncomingConnection(self, svcName=False):
244244
duration = now - throttleStartedAt
245245
diag = self.__services[svcName].throttleDiagnostics()
246246
gLogger.warn(
247-
f"Service {svcName} still throttling after {duration:.0f}s",
248-
f"queue={diag['queue']}/{diag['maxQueue']}, " f"threads={diag['threads']}/{diag['maxThreads']}",
247+
f"Service {svcName} still throttling",
248+
f"duration={duration:.0f}s, queue={diag['queue']}/{diag['maxQueue']}, "
249+
f"threads={diag['threads']}/{diag['maxThreads']}",
249250
)
250251
lastThrottleLog = now
252+
# Check if throttle has exceeded the maximum allowed duration
253+
maxThrottleDuration = self.__services[svcName].getConfig().getMaxThrottleDuration()
254+
if maxThrottleDuration > 0 and (now - throttleStartedAt) > maxThrottleDuration:
255+
diag = self.__services[svcName].throttleDiagnostics()
256+
gLogger.fatal(
257+
f"Service {svcName} stuck in throttle, initiating process restart",
258+
f"duration={now - throttleStartedAt:.0f}s (limit: {maxThrottleDuration}s), "
259+
f"queue={diag['queue']}/{diag['maxQueue']}, "
260+
f"threads={diag['threads']}/{diag['maxThreads']}",
261+
)
262+
clientTransport.close()
263+
self.__alive = False
264+
return
265+
gLogger.warn("Rejecting client due to throttling", str(clientTransport.getRemoteAddress()))
251266
clientTransport.close()
252267
time.sleep(THROTTLE_SERVICE_SLEEP_SECONDS)
253268
continue

src/DIRAC/Core/DISET/private/ServiceConfiguration.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,18 @@ def getURL(self):
121121
self.setURL(serviceURL)
122122
return serviceURL
123123

124+
def getMaxThrottleDuration(self):
125+
"""Maximum seconds a service can remain in throttle mode before triggering a restart.
126+
127+
Set to 0 to disable auto-restart (default).
128+
When the throttle duration exceeds this value, the service process exits
129+
to allow the process supervisor (e.g. runsv) to restart it cleanly.
130+
"""
131+
try:
132+
return int(self.getOption("MaxThrottleDuration"))
133+
except Exception:
134+
return 0
135+
124136
def getContextLifeTime(self):
125137
optionValue = self.getOption("ContextLifeTime")
126138
try:

0 commit comments

Comments
 (0)