Skip to content

Commit e1be3d6

Browse files
committed
fix(pegboard): align drain grace defaults with stop threshold
1 parent d612792 commit e1be3d6

13 files changed

Lines changed: 76 additions & 18 deletions

File tree

engine/packages/api-types/src/namespaces/runner_configs.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,7 @@ impl Into<rivet_types::runner_configs::RunnerConfig> for RunnerConfig {
6868
request_lifespan,
6969
max_concurrent_actors: max_concurrent_actors.unwrap_or(max_runners as u64),
7070
// Default to 30 minutes to match the default actor_stop_threshold (previously the deprecated config.pegboard.serverless_drain_grace_period value of 10s)
71-
drain_grace_period: drain_grace_period.unwrap_or(10),
71+
drain_grace_period: drain_grace_period.unwrap_or(30 * 60),
7272
slots_per_runner,
7373
min_runners: min_runners.unwrap_or_default(),
7474
max_runners,

engine/packages/config/src/config/pegboard.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -173,7 +173,7 @@ impl Pegboard {
173173
/// When changing this default, update
174174
/// website/src/content/docs/actors/versions.mdx (SIGTERM Handling section).
175175
pub fn actor_stop_threshold(&self) -> i64 {
176-
self.actor_stop_threshold.unwrap_or(30_000)
176+
self.actor_stop_threshold.unwrap_or(30 * 60 * 1000)
177177
}
178178

179179
pub fn actor_retry_duration_threshold(&self) -> i64 {

engine/packages/engine/tests/runner/api_runner_configs_upsert.rs

Lines changed: 46 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -669,3 +669,49 @@ fn upsert_runner_config_serverless_slots_per_runner_zero() {
669669
);
670670
});
671671
}
672+
673+
#[test]
674+
fn upsert_runner_config_serverless_drain_grace_period_exceeds_actor_stop_threshold() {
675+
common::run(common::TestOpts::new(1), |ctx| async move {
676+
let (namespace, _) = common::setup_test_namespace(ctx.leader_dc()).await;
677+
678+
let runner_name = "long-drain-runner";
679+
let mut datacenters = HashMap::new();
680+
datacenters.insert(
681+
"dc-1".to_string(),
682+
rivet_api_types::namespaces::runner_configs::RunnerConfig {
683+
kind: rivet_api_types::namespaces::runner_configs::RunnerConfigKind::Serverless {
684+
url: "http://example.com".to_string(),
685+
headers: None,
686+
request_lifespan: 30,
687+
max_concurrent_actors: Some(5),
688+
drain_grace_period: Some(30 * 60 + 1),
689+
slots_per_runner: 1,
690+
min_runners: Some(1),
691+
max_runners: 5,
692+
runners_margin: Some(2),
693+
metadata_poll_interval: None,
694+
},
695+
metadata: None,
696+
drain_on_version_upgrade: true,
697+
},
698+
);
699+
700+
let result = common::api::public::runner_configs_upsert(
701+
ctx.leader_dc().guard_port(),
702+
rivet_api_peer::runner_configs::UpsertPath {
703+
runner_name: runner_name.to_string(),
704+
},
705+
rivet_api_peer::runner_configs::UpsertQuery {
706+
namespace: namespace.clone(),
707+
},
708+
rivet_api_public::runner_configs::upsert::UpsertRequest { datacenters },
709+
)
710+
.await;
711+
712+
assert!(
713+
result.is_err(),
714+
"Upsert should fail when drain_grace_period exceeds actor_stop_threshold"
715+
);
716+
});
717+
}

engine/packages/pegboard/src/ops/runner_config/upsert.rs

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@ pub async fn pegboard_runner_config_upsert(ctx: &OperationCtx, input: &Input) ->
2929
RunnerConfigKind::Serverless {
3030
url,
3131
headers,
32+
drain_grace_period,
3233
slots_per_runner,
3334
..
3435
} => {
@@ -79,6 +80,17 @@ pub async fn pegboard_runner_config_upsert(ctx: &OperationCtx, input: &Input) ->
7980
}
8081
.build());
8182
}
83+
84+
let actor_stop_threshold_ms = ctx.config().pegboard().actor_stop_threshold();
85+
let drain_grace_period_ms = i64::from(*drain_grace_period) * 1000;
86+
if drain_grace_period_ms > actor_stop_threshold_ms {
87+
return Err(errors::RunnerConfig::Invalid {
88+
reason: format!(
89+
"`drain_grace_period` cannot be greater than `actor_stop_threshold` ({drain_grace_period_ms}ms > {actor_stop_threshold_ms}ms)"
90+
),
91+
}
92+
.build());
93+
}
8294
}
8395
}
8496

engine/sdks/rust/data/src/versioned/namespace_runner_config.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -309,7 +309,7 @@ impl NamespaceRunnerConfig {
309309
// Default to max_runners for v4 -> v5 migration
310310
max_concurrent_actors: serverless.max_runners as u64,
311311
// Default to 30 minutes to match the default actor_stop_threshold (previously the deprecated config.pegboard.serverless_drain_grace_period value of 10s)
312-
drain_grace_period: 10,
312+
drain_grace_period: 30 * 60,
313313
slots_per_runner: serverless.slots_per_runner,
314314
min_runners: serverless.min_runners,
315315
max_runners: serverless.max_runners,

rivetkit-typescript/packages/rivetkit/src/serverless/configure.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,7 @@ export async function configureServerlessPool(
6262
serverless: {
6363
url: customConfig.url,
6464
headers,
65-
request_lifespan: customConfig.requestLifespan ?? 15 * 60,
65+
request_lifespan: customConfig.requestLifespan ?? 60 * 60,
6666
drain_grace_period: customConfig.drainGracePeriod,
6767
metadata_poll_interval:
6868
customConfig.metadataPollInterval ?? 1000,

rivetkit-typescript/packages/rivetkit/tests/platforms/shared-platform-harness.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -497,7 +497,7 @@ export async function createPlatformServerlessRunner({
497497
serverless: {
498498
url: serverlessUrl,
499499
headers: headers ?? {},
500-
request_lifespan: requestLifespan ?? 15 * 60,
500+
request_lifespan: requestLifespan ?? 60 * 60,
501501
drain_grace_period: drainGracePeriod,
502502
metadata_poll_interval: metadataPollInterval ?? 1_000,
503503
max_runners: maxRunners ?? 100_000,

website/src/content/docs/actors/limits.mdx

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -138,15 +138,15 @@ These timeouts control how actors are shut down when a serverless request reache
138138

139139
| Name | Soft Limit | Hard Limit | Description |
140140
|------|------------|------------|-------------|
141-
| Request lifespan | 900 seconds (15 min) || Total lifespan of a serverless request before drain begins. Configurable via `requestLifespan` in [`configurePool`](/docs/connect/registry-configuration). |
141+
| Request lifespan | 3600 seconds (60 min) || Total lifespan of a serverless request before drain begins. Configurable via `requestLifespan` in [`configurePool`](/docs/connect/registry-configuration). |
142142
| Serverless drain grace period || 10 seconds | Time reserved at the end of a request for actors to stop gracefully. Configurable via [engine config](/docs/self-hosting/configuration) (`pegboard.serverless_drain_grace_period`). |
143143

144144
### Actor Lifecycle
145145

146146
| Name | Soft Limit | Hard Limit | Description |
147147
|------|------------|------------|-------------|
148148
| Actor start threshold || 30 seconds | Maximum time for an actor to start before it is considered lost and rescheduled. |
149-
| Actor stop threshold || 30 seconds | Maximum time for an actor to stop before it is considered lost. |
149+
| Actor stop threshold || 30 minutes | Maximum time for an actor to stop before it is considered lost. |
150150

151151
## Increasing Limits
152152

website/src/content/docs/actors/versions.mdx

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -215,7 +215,7 @@ The `drainOnVersionUpgrade` option controls whether old actors are stopped when
215215
| Value | Behavior |
216216
|-------|----------|
217217
| `false` (default in [runner mode](/docs/general/runtime-modes)) | Old actors continue running. New actors go to new version. Versions coexist. |
218-
| `true` (default in [serverless mode](/docs/general/runtime-modes)) | Old actors receive stop signal and have 30s to finish gracefully. |
218+
| `true` (default in [serverless mode](/docs/general/runtime-modes)) | Old actors receive stop signal and have 30m to finish gracefully. |
219219

220220
## Upgrading Actor State
221221

@@ -343,11 +343,11 @@ When a runner process receives SIGTERM, it gracefully stops all actors before ex
343343

344344
- Each actor's `onSleep` hook is called, giving it time to save state
345345
- Actors are rescheduled to other available runners
346-
- The runner waits up to **120 seconds** for all actors to finish stopping
346+
- The runner waits up to **30 minutes** for all actors to finish stopping
347347
- If the process is force-killed before actors finish (e.g. SIGKILL), actors are rescheduled with a crash backoff penalty instead of a clean handoff
348348

349349
<Note>
350-
Ensure your platform's shutdown grace period is at least **130 seconds** to give actors time to stop cleanly.
350+
Ensure your platform's shutdown grace period is at least **30 minutes** to give actors time to stop cleanly.
351351
</Note>
352352

353353
### Shutdown Timeouts
@@ -356,7 +356,7 @@ Several timeouts control how long each part of the shutdown process can take:
356356

357357
| Timeout | Default | Description | Configuration |
358358
|---------|---------|-------------|---------------|
359-
| `actor_stop_threshold` | 30s | Engine-side limit on how long each actor has to stop before being marked lost | [Engine config](/docs/self-hosting/configuration) (`pegboard.actor_stop_threshold`) |
359+
| `actor_stop_threshold` | 30m | Engine-side limit on how long each actor has to stop before being marked lost | [Engine config](/docs/self-hosting/configuration) (`pegboard.actor_stop_threshold`) |
360360
| `sleepGracePeriod` | 15s | Total graceful sleep budget for `onSleep`, `waitUntil`, `keepAwake`, and async raw WebSocket handlers | [Actor options](/docs/actors/lifecycle#options) |
361361
| `runner_lost_threshold` | 15s | Fallback detection if the runner dies without graceful shutdown | [Engine config](/docs/self-hosting/configuration) (`pegboard.runner_lost_threshold`) |
362362

website/src/content/docs/connect/kubernetes.mdx

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -79,9 +79,9 @@ spec:
7979
app: rivetkit-app
8080
spec:
8181
# Allow enough time for actors to gracefully stop on SIGTERM.
82-
# The runner waits up to 120s for actors to finish; 130s provides buffer.
82+
# The runner waits up to 30m for actors to finish.
8383
# See: /docs/actors/versions#graceful-shutdown-sigterm
84-
terminationGracePeriodSeconds: 130
84+
terminationGracePeriodSeconds: 1800
8585
containers:
8686
- name: rivetkit-app
8787
image: registry.example.com/your-team/rivetkit-app:latest

0 commit comments

Comments
 (0)