Skip to content

Commit 6fd1f1b

Browse files
authored
Fix no offers retry for scheduled runs (#3759)
1 parent 4062911 commit 6fd1f1b

File tree

4 files changed

+114
-5
lines changed

4 files changed

+114
-5
lines changed

src/dstack/_internal/server/background/pipeline_tasks/runs/active.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -288,10 +288,11 @@ async def _should_retry_job(
288288
job_model: JobModel,
289289
) -> Optional[timedelta]:
290290
"""
291-
Checks if the job should be retried.
292-
Returns the current duration of retrying if retry is enabled.
293-
Retrying duration is calculated as the time since `last_processed_at`
294-
of the latest provisioned submission.
291+
Checks if the job should be retried and returns the elapsed retry duration.
292+
293+
For `no-capacity`, retry is limited by the age of the current run (or, for scheduled runs, by the time since the current trigger time). Once the
294+
job has already been provisioned, retry is limited by the time since the latest
295+
provisioned submission for that job.
295296
"""
296297
job_spec = get_job_spec(job_model)
297298
if job_spec.retry is None:
@@ -309,7 +310,13 @@ async def _should_retry_job(
309310
and last_provisioned is None
310311
and RetryEvent.NO_CAPACITY in job_spec.retry.on_events
311312
):
312-
return get_current_datetime() - run_model.submitted_at
313+
retry_started_at = run_model.submitted_at
314+
if run_model.next_triggered_at is not None:
315+
# Scheduled runs keep `next_triggered_at` pointing to the current trigger time while
316+
# retrying. Retryable failures go back to PENDING directly, and the terminating worker
317+
# advances `next_triggered_at` only when the current execution is over.
318+
retry_started_at = run_model.next_triggered_at
319+
return get_current_datetime() - retry_started_at
313320

314321
if (
315322
job_model.termination_reason is not None

src/dstack/_internal/server/background/pipeline_tasks/runs/terminating.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ class TerminatingRunUpdateMap(ItemUpdateMap, total=False):
2929
status: RunStatus
3030
next_triggered_at: Optional[datetime]
3131
fleet_id: Optional[uuid.UUID]
32+
resubmission_attempt: int
3233

3334

3435
class TerminatingRunJobUpdateMap(ItemUpdateMap, total=False):
@@ -134,6 +135,7 @@ def _get_run_update_map(run_model: models.RunModel) -> TerminatingRunUpdateMap:
134135
status=RunStatus.PENDING,
135136
next_triggered_at=_get_next_triggered_at(run_spec),
136137
fleet_id=None,
138+
resubmission_attempt=0,
137139
)
138140
return TerminatingRunUpdateMap(status=termination_reason.to_status())
139141

src/tests/_internal/server/background/pipeline_tasks/test_runs/test_active.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
Profile,
1818
ProfileRetry,
1919
RetryEvent,
20+
Schedule,
2021
StopCriteria,
2122
)
2223
from dstack._internal.core.models.resources import Range
@@ -295,6 +296,104 @@ async def test_retries_no_capacity_replica_and_keeps_service_running(
295296
assert retried_job.status == JobStatus.SUBMITTED
296297
assert len(jobs) == 3
297298

299+
async def test_retries_scheduled_run_no_capacity_from_trigger_time(
300+
self, test_db, session: AsyncSession, worker: RunWorker
301+
) -> None:
302+
project = await create_project(session=session)
303+
user = await create_user(session=session)
304+
repo = await create_repo(session=session, project_id=project.id)
305+
run_spec = get_run_spec(
306+
repo_id=repo.name,
307+
profile=Profile(
308+
name="default",
309+
retry=ProfileRetry(duration=3600, on_events=[RetryEvent.NO_CAPACITY]),
310+
),
311+
configuration=TaskConfiguration(
312+
commands=["echo hello"],
313+
schedule=Schedule(cron="15 * * * *"),
314+
),
315+
)
316+
trigger_time = get_current_datetime() - timedelta(minutes=5)
317+
run = await create_run(
318+
session=session,
319+
project=project,
320+
repo=repo,
321+
user=user,
322+
run_spec=run_spec,
323+
status=RunStatus.SUBMITTED,
324+
submitted_at=get_current_datetime() - timedelta(hours=2),
325+
next_triggered_at=trigger_time,
326+
resubmission_attempt=0,
327+
)
328+
await create_job(
329+
session=session,
330+
run=run,
331+
status=JobStatus.FAILED,
332+
termination_reason=JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY,
333+
)
334+
lock_run(run)
335+
await session.commit()
336+
337+
with patch(
338+
"dstack._internal.server.background.pipeline_tasks.runs.active.get_current_datetime",
339+
return_value=trigger_time + timedelta(minutes=10),
340+
):
341+
await worker.process(run_to_pipeline_item(run))
342+
343+
await session.refresh(run)
344+
assert run.status == RunStatus.PENDING
345+
assert run.resubmission_attempt == 1
346+
assert run.lock_token is None
347+
348+
async def test_terminates_scheduled_run_when_no_capacity_retry_exceeded_from_trigger_time(
349+
self, test_db, session: AsyncSession, worker: RunWorker
350+
) -> None:
351+
project = await create_project(session=session)
352+
user = await create_user(session=session)
353+
repo = await create_repo(session=session, project_id=project.id)
354+
run_spec = get_run_spec(
355+
repo_id=repo.name,
356+
profile=Profile(
357+
name="default",
358+
retry=ProfileRetry(duration=600, on_events=[RetryEvent.NO_CAPACITY]),
359+
),
360+
configuration=TaskConfiguration(
361+
commands=["echo hello"],
362+
schedule=Schedule(cron="15 * * * *"),
363+
),
364+
)
365+
trigger_time = get_current_datetime() - timedelta(minutes=20)
366+
run = await create_run(
367+
session=session,
368+
project=project,
369+
repo=repo,
370+
user=user,
371+
run_spec=run_spec,
372+
status=RunStatus.SUBMITTED,
373+
submitted_at=get_current_datetime() - timedelta(hours=2),
374+
next_triggered_at=trigger_time,
375+
resubmission_attempt=0,
376+
)
377+
await create_job(
378+
session=session,
379+
run=run,
380+
status=JobStatus.FAILED,
381+
termination_reason=JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY,
382+
)
383+
lock_run(run)
384+
await session.commit()
385+
386+
with patch(
387+
"dstack._internal.server.background.pipeline_tasks.runs.active.get_current_datetime",
388+
return_value=trigger_time + timedelta(minutes=20),
389+
):
390+
await worker.process(run_to_pipeline_item(run))
391+
392+
await session.refresh(run)
393+
assert run.status == RunStatus.TERMINATING
394+
assert run.termination_reason == RunTerminationReason.RETRY_LIMIT_EXCEEDED
395+
assert run.lock_token is None
396+
298397
async def test_retrying_multinode_replica_terminates_active_sibling_jobs(
299398
self, test_db, session: AsyncSession, worker: RunWorker
300399
) -> None:

src/tests/_internal/server/background/pipeline_tasks/test_runs/test_termination.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ async def test_reschedules_scheduled_run_and_clears_fleet(
221221
await session.refresh(run)
222222
assert run.status == RunStatus.PENDING
223223
assert run.next_triggered_at == datetime(2023, 1, 2, 3, 15, tzinfo=timezone.utc)
224+
assert run.resubmission_attempt == 0
224225
assert run.fleet_id is None
225226
assert run.lock_token is None
226227
assert run.lock_expires_at is None

0 commit comments

Comments
 (0)