Skip to content

Commit af962a2

Browse files
authored
Merge pull request #865 from ninan-nn/hotfix/hotfix_pool
fix(sdks): fix backoff
2 parents 071f7f2 + 0c99d74 commit af962a2

4 files changed

Lines changed: 29 additions & 5 deletions

File tree

sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/pool/ReconcileState.kt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ import java.time.Instant
2828
internal class ReconcileState(
2929
private val degradedThreshold: Int,
3030
private val backoffBase: Duration = Duration.ofSeconds(1),
31-
private val backoffMax: Duration = Duration.ofSeconds(60),
31+
private val backoffMax: Duration = Duration.ofDays(1),
3232
) {
3333
@Volatile
3434
var failureCount: Int = 0
@@ -63,7 +63,7 @@ internal class ReconcileState(
6363
if (failureCount >= degradedThreshold) {
6464
state = PoolState.DEGRADED
6565
backoffAttempts++
66-
val exponent = backoffAttempts.coerceAtMost(10)
66+
val exponent = backoffAttempts.coerceAtMost(30)
6767
val delaySeconds = backoffBase.seconds * (1L shl exponent)
6868
val delayMs =
6969
minOf(

sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/pool/PoolReconcilerStateTest.kt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,18 @@ class PoolReconcilerStateTest {
4444
assertEquals(3, state.failureCount)
4545
}
4646

47+
/**
 * Checks that a [ReconcileState] built with the default backoff settings keeps its
 * DEGRADED backoff window to roughly one day, however many failures are recorded.
 */
@Test
48+
fun `default degraded backoff caps at one day`() {
49+
// degradedThreshold = 1: the very first recorded failure already satisfies failureCount >= degradedThreshold.
val state = ReconcileState(degradedThreshold = 1)
50+
51+
// With the default 1s base, 20 degraded failures give an uncapped delay of about 2^20 s (~12 days),
// assuming backoffAttempts starts at 0 and grows by one per failure - so only the cap can keep it near one day.
repeat(20) { state.recordFailure("boom") }
52+
53+
assertEquals(PoolState.DEGRADED, state.state)
54+
assertEquals(20, state.failureCount)
55+
// Backoff must still be active 23h from now ...
assertEquals(true, state.isBackoffActive(Instant.now().plus(Duration.ofHours(23))))
56+
// ... and must have expired 25h from now, which brackets the end of the window around the 24h mark.
assertFalse(state.isBackoffActive(Instant.now().plus(Duration.ofHours(25))))
57+
}
58+
4759
@Test
4860
fun `reconcile create exception increments failure count once per task`() {
4961
val stateStore = InMemoryPoolStateStore()

sdks/sandbox/python/src/opensandbox/_pool_reconciler.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
class ReconcileState:
3333
degraded_threshold: int
3434
backoff_base: timedelta = timedelta(seconds=1)
35-
backoff_max: timedelta = timedelta(seconds=60)
35+
backoff_max: timedelta = timedelta(days=1)
3636
failure_count: int = 0
3737
state: PoolState = PoolState.HEALTHY
3838
last_error: str | None = None
@@ -53,7 +53,7 @@ def record_failure(self, error_message: str | None) -> None:
5353
if self.failure_count >= self.degraded_threshold:
5454
self.state = PoolState.DEGRADED
5555
self.backoff_attempts += 1
56-
exponent = min(self.backoff_attempts, 10)
56+
exponent = min(self.backoff_attempts, 30)
5757
delay = min(
5858
self.backoff_base.total_seconds() * (1 << exponent),
5959
self.backoff_max.total_seconds(),

sdks/sandbox/python/tests/test_pool_sync.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,13 @@
22

33
import threading
44
import time
5-
from datetime import timedelta
5+
from datetime import datetime, timedelta, timezone
66
from typing import Any, cast
77

88
import httpx
99
import pytest
1010

11+
from opensandbox._pool_reconciler import ReconcileState
1112
from opensandbox.config.connection_sync import ConnectionConfigSync
1213
from opensandbox.exceptions import (
1314
PoolAcquireFailedException,
@@ -19,6 +20,17 @@
1920
from opensandbox.sync.pool import SandboxPoolSync
2021

2122

23+
# Checks that a ReconcileState built with the default backoff settings keeps its
# degraded backoff window to roughly one day, however many failures are recorded.
def test_degraded_backoff_caps_at_one_day() -> None:
24+
# degraded_threshold=1: the very first recorded failure already meets the degraded threshold.
state = ReconcileState(degraded_threshold=1)
25+
26+
# With the default 1s base, 20 degraded failures give an uncapped delay of about 2**20 s (~12 days),
# assuming backoff_attempts starts at 0 and grows by one per failure - so only the cap can keep it near one day.
for _ in range(20):
27+
state.record_failure("boom")
28+
29+
assert state.failure_count == 20
30+
# Backoff must still be active 23h from now ...
assert state.is_backoff_active(datetime.now(timezone.utc) + timedelta(hours=23))
31+
# ... and must have expired 25h from now, which brackets the end of the window around the 24h mark.
assert not state.is_backoff_active(datetime.now(timezone.utc) + timedelta(hours=25))
32+
33+
2234
def test_acquire_fail_fast_empty_raises_pool_empty() -> None:
2335
pool = _create_pool(max_idle=0)
2436
pool.start()

0 commit comments

Comments
 (0)