|
17 | 17 | Profile, |
18 | 18 | ProfileRetry, |
19 | 19 | RetryEvent, |
| 20 | + Schedule, |
20 | 21 | StopCriteria, |
21 | 22 | ) |
22 | 23 | from dstack._internal.core.models.resources import Range |
@@ -295,6 +296,104 @@ async def test_retries_no_capacity_replica_and_keeps_service_running( |
295 | 296 | assert retried_job.status == JobStatus.SUBMITTED |
296 | 297 | assert len(jobs) == 3 |
297 | 298 |
|
async def test_retries_scheduled_run_no_capacity_from_trigger_time(
    self, test_db, session: AsyncSession, worker: RunWorker
) -> None:
    """Check that a scheduled run with a no-capacity job failure goes back to PENDING.

    Timeline used by the test:
      * the run was submitted 2 hours ago,
      * its ``next_triggered_at`` is 5 minutes ago,
      * the worker's clock is patched to 10 minutes after the trigger,
      * the retry duration is 3600 seconds.

    Counted from ``submitted_at`` the one-hour retry window would already be
    over; counted from the trigger time only 10 minutes have elapsed. The
    assertions below therefore only hold if the window is measured from the
    trigger time.
    """
    project = await create_project(session=session)
    user = await create_user(session=session)
    repo = await create_repo(session=session, project_id=project.id)

    retry_policy = ProfileRetry(duration=3600, on_events=[RetryEvent.NO_CAPACITY])
    task_conf = TaskConfiguration(
        commands=["echo hello"],
        schedule=Schedule(cron="15 * * * *"),
    )
    spec = get_run_spec(
        repo_id=repo.name,
        profile=Profile(name="default", retry=retry_policy),
        configuration=task_conf,
    )

    triggered_at = get_current_datetime() - timedelta(minutes=5)
    submitted_at = get_current_datetime() - timedelta(hours=2)
    scheduled_run = await create_run(
        session=session,
        project=project,
        repo=repo,
        user=user,
        run_spec=spec,
        status=RunStatus.SUBMITTED,
        submitted_at=submitted_at,
        next_triggered_at=triggered_at,
        resubmission_attempt=0,
    )
    # The only job of the run has failed to start because of missing capacity.
    await create_job(
        session=session,
        run=scheduled_run,
        status=JobStatus.FAILED,
        termination_reason=JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY,
    )
    lock_run(scheduled_run)
    await session.commit()

    # Freeze the clock seen by the run pipeline at "trigger + 10 minutes".
    clock_target = (
        "dstack._internal.server.background.pipeline_tasks.runs.active.get_current_datetime"
    )
    frozen_now = triggered_at + timedelta(minutes=10)
    with patch(clock_target, return_value=frozen_now):
        await worker.process(run_to_pipeline_item(scheduled_run))

    await session.refresh(scheduled_run)
    assert scheduled_run.status == RunStatus.PENDING
    assert scheduled_run.resubmission_attempt == 1
    assert scheduled_run.lock_token is None
| 347 | + |
async def test_terminates_scheduled_run_when_no_capacity_retry_exceeded_from_trigger_time(
    self, test_db, session: AsyncSession, worker: RunWorker
) -> None:
    """Check that a scheduled run is terminated once its retry duration has elapsed.

    Timeline used by the test:
      * the run was submitted 2 hours ago,
      * its ``next_triggered_at`` is 20 minutes ago,
      * the worker's clock is patched to 20 minutes after the trigger,
      * the retry duration is 600 seconds.

    Twenty minutes past the trigger exceeds the ten-minute retry window, so the
    run is expected to move to TERMINATING with RETRY_LIMIT_EXCEEDED.
    """
    project = await create_project(session=session)
    user = await create_user(session=session)
    repo = await create_repo(session=session, project_id=project.id)

    retry_policy = ProfileRetry(duration=600, on_events=[RetryEvent.NO_CAPACITY])
    task_conf = TaskConfiguration(
        commands=["echo hello"],
        schedule=Schedule(cron="15 * * * *"),
    )
    spec = get_run_spec(
        repo_id=repo.name,
        profile=Profile(name="default", retry=retry_policy),
        configuration=task_conf,
    )

    triggered_at = get_current_datetime() - timedelta(minutes=20)
    submitted_at = get_current_datetime() - timedelta(hours=2)
    scheduled_run = await create_run(
        session=session,
        project=project,
        repo=repo,
        user=user,
        run_spec=spec,
        status=RunStatus.SUBMITTED,
        submitted_at=submitted_at,
        next_triggered_at=triggered_at,
        resubmission_attempt=0,
    )
    # The only job of the run has failed to start because of missing capacity.
    await create_job(
        session=session,
        run=scheduled_run,
        status=JobStatus.FAILED,
        termination_reason=JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY,
    )
    lock_run(scheduled_run)
    await session.commit()

    # Freeze the clock seen by the run pipeline at "trigger + 20 minutes".
    clock_target = (
        "dstack._internal.server.background.pipeline_tasks.runs.active.get_current_datetime"
    )
    frozen_now = triggered_at + timedelta(minutes=20)
    with patch(clock_target, return_value=frozen_now):
        await worker.process(run_to_pipeline_item(scheduled_run))

    await session.refresh(scheduled_run)
    assert scheduled_run.status == RunStatus.TERMINATING
    assert scheduled_run.termination_reason == RunTerminationReason.RETRY_LIMIT_EXCEEDED
    assert scheduled_run.lock_token is None
| 396 | + |
298 | 397 | async def test_retrying_multinode_replica_terminates_active_sibling_jobs( |
299 | 398 | self, test_db, session: AsyncSession, worker: RunWorker |
300 | 399 | ) -> None: |
|
0 commit comments