Skip to content

Commit 5a5b134

Browse files
authored
Retry on VOLUME_ERROR and INSTANCE_UNREACHABLE (#2805)
Also refactor so that it is less likely that we forget to associate new termination reasons with retry events.
1 parent 6157124 commit 5a5b134

3 files changed

Lines changed: 26 additions & 20 deletions

File tree

src/dstack/_internal/core/models/runs.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,19 @@ def to_status(self) -> JobStatus:
148148
}
149149
return mapping[self]
150150

151+
def to_retry_event(self) -> Optional[RetryEvent]:
152+
"""
153+
Returns:
154+
the retry event this termination reason triggers
155+
or None if this termination reason should not be retried
156+
"""
157+
mapping = {
158+
self.FAILED_TO_START_DUE_TO_NO_CAPACITY: RetryEvent.NO_CAPACITY,
159+
self.INTERRUPTED_BY_NO_CAPACITY: RetryEvent.INTERRUPTION,
160+
}
161+
default = RetryEvent.ERROR if self.to_status() == JobStatus.FAILED else None
162+
return mapping.get(self, default)
163+
151164

152165
class Requirements(CoreModel):
153166
# TODO: Make requirements' fields required

src/dstack/_internal/server/background/tasks/process_runs.py

Lines changed: 5 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,8 @@ def _should_retry_job(run: Run, job: Job, job_model: JobModel) -> Optional[datet
393393
break
394394

395395
if (
396-
job_model.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
396+
job_model.termination_reason is not None
397+
and job_model.termination_reason.to_retry_event() == RetryEvent.NO_CAPACITY
397398
and last_provisioned_submission is None
398399
and RetryEvent.NO_CAPACITY in job.job_spec.retry.on_events
399400
):
@@ -403,24 +404,9 @@ def _should_retry_job(run: Run, job: Job, job_model: JobModel) -> Optional[datet
403404
return None
404405

405406
if (
406-
last_provisioned_submission.termination_reason
407-
== JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
408-
and RetryEvent.INTERRUPTION in job.job_spec.retry.on_events
409-
):
410-
return common.get_current_datetime() - last_provisioned_submission.last_processed_at
411-
412-
if (
413-
last_provisioned_submission.termination_reason
414-
in [
415-
JobTerminationReason.CONTAINER_EXITED_WITH_ERROR,
416-
JobTerminationReason.CREATING_CONTAINER_ERROR,
417-
JobTerminationReason.EXECUTOR_ERROR,
418-
JobTerminationReason.GATEWAY_ERROR,
419-
JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED,
420-
JobTerminationReason.WAITING_RUNNER_LIMIT_EXCEEDED,
421-
JobTerminationReason.PORTS_BINDING_FAILED,
422-
]
423-
and RetryEvent.ERROR in job.job_spec.retry.on_events
407+
last_provisioned_submission.termination_reason is not None
408+
and last_provisioned_submission.termination_reason.to_retry_event()
409+
in job.job_spec.retry.on_events
424410
):
425411
return common.get_current_datetime() - last_provisioned_submission.last_processed_at
426412

src/tests/_internal/core/models/test_runs.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from dstack._internal.core.models.profiles import RetryEvent
12
from dstack._internal.core.models.runs import (
23
JobStatus,
34
JobSubmission,
@@ -20,12 +21,18 @@ def test_run_termination_reason_to_status_works_with_all_enum_variants():
2021
assert isinstance(run_status, RunStatus)
2122

2223

23-
def test_job_termination_reason_to_status_works_with_all_enum_varians():
24+
def test_job_termination_reason_to_status_works_with_all_enum_variants():
2425
for job_termination_reason in JobTerminationReason:
2526
job_status = job_termination_reason.to_status()
2627
assert isinstance(job_status, JobStatus)
2728

2829

30+
def test_job_termination_reason_to_retry_event_works_with_all_enum_variants():
31+
for job_termination_reason in JobTerminationReason:
32+
retry_event = job_termination_reason.to_retry_event()
33+
assert retry_event is None or isinstance(retry_event, RetryEvent)
34+
35+
2936
# Will fail if JobTerminationReason value is added without updating JobSubmission._get_error
3037
def test_get_error_returns_expected_messages():
3138
no_error_reasons = [

0 commit comments

Comments
 (0)