Skip to content

Commit b77fad9

Browse files
committed
test(engine): add regression test for alarm-during-sleep-transition race
1 parent 158603e commit b77fad9

1 file changed

Lines changed: 68 additions & 0 deletions

File tree

engine/packages/engine/tests/runner/actors_alarm.rs

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1451,3 +1451,71 @@ fn many_actors_same_alarm_time() {
14511451
);
14521452
});
14531453
}
1454+
/// Regression guard: an alarm that comes due while the actor is in the middle
/// of going to sleep must still fire.
///
/// # Scenario
///
/// The actor schedules an alarm a short time in the future and then sends a
/// sleep intent right away. The stop flow can be slow enough that the alarm is
/// already overdue by the time `handle_stopped` processes `Decision::Sleep`.
///
/// # History
///
/// Prior to the fix in `actor2/runtime.rs`, that window cleared
/// `state.alarm_ts` without servicing the overdue alarm. The scheduled work
/// was dropped silently, the actor went to sleep, and its handler never ran.
///
/// With the fix, `Decision::Sleep` notices the overdue alarm, reallocates the
/// actor, and bumps the generation so that the alarm handler runs.
///
/// # Expected outcome
///
/// Using a very short alarm offset, the actor wakes at generation 1 rather
/// than sleeping forever. If the fix is reverted, the alarm never triggers and
/// this test times out while waiting for the wake.
#[test]
#[ignore = "captures alarm-during-sleep-transition race; times out if the overdue-alarm reallocation path regresses"]
fn alarm_overdue_during_sleep_transition_fires_via_reallocation() {
    let opts = common::TestOpts::new(1).with_timeout(15);

    common::run(opts, |test_ctx| async move {
        let (ns, _) = common::setup_test_namespace(test_ctx.leader_dc()).await;

        // The sender half is wrapped so that every actor instance built by the
        // behavior factory below can share it.
        let (signal_tx, signal_rx) = tokio::sync::oneshot::channel();
        let shared_signal = Arc::new(Mutex::new(Some(signal_tx)));

        let test_runner = common::setup_runner(test_ctx.leader_dc(), &ns, |builder| {
            builder.with_actor_behavior("alarm-actor", move |_| {
                // A 100ms offset gives the sleep intent time to be dispatched,
                // yet is short enough for the alarm to be near-overdue once
                // the workflow reaches `Decision::Sleep`.
                let alarm_offset_ms = 100;
                Box::new(AlarmAndSleepOnceActor::new(
                    alarm_offset_ms,
                    shared_signal.clone(),
                ))
            })
        })
        .await;

        let created = common::create_actor(
            test_ctx.leader_dc().guard_port(),
            &ns,
            "alarm-actor",
            test_runner.name(),
            rivet_types::actors::CrashPolicy::Destroy,
        )
        .await;
        let actor_id = created.actor.actor_id.to_string();

        signal_rx.await.expect("actor should send ready signal");

        // Subscribe only after the ready signal, exactly as the test was
        // originally written.
        let events = test_runner.subscribe_lifecycle_events();

        // Had the overdue alarm been dropped, the actor would go to sleep and
        // never wake. A successful reallocation wakes it at generation 1.
        let expected_generation = 1;
        // Limit handed to the wait helper; its unit is defined by the helper.
        let wait_limit = 10;
        let wake_result =
            wait_for_actor_wake_from_alarm(events, &actor_id, expected_generation, wait_limit)
                .await;
        wake_result.expect(
            "actor should wake from the overdue alarm via reallocation; \
             if this times out, the `Decision::Sleep` overdue-alarm path was dropped",
        );

        tracing::info!(?actor_id, "overdue alarm fired via reallocation");
    });
}

0 commit comments

Comments
 (0)