Skip to content

Commit da9317a

Browse files
[Internal]: Replace Instance.termination_reason values with codes #3182
1 parent 65f8d48 commit da9317a

8 files changed

Lines changed: 93 additions & 22 deletions

File tree

src/dstack/_internal/core/models/instances.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,18 @@ def finished_statuses(cls) -> List["InstanceStatus"]:
216216
return [cls.TERMINATING, cls.TERMINATED]
217217

218218

219+
class InstanceTerminationReason(str, Enum):
    """Machine-readable codes stored in an instance's ``termination_reason``.

    Callers persist the ``.value`` string of a member. When a code alone is
    not descriptive enough (notably ``ERROR``), the human-readable detail is
    stored separately in ``termination_reason_message``.
    """

    # The instance stayed idle longer than its allowed idle duration.
    IDLE_TIMEOUT = "idle_timeout"
    # The instance could not be added before the provisioning deadline.
    PROVISIONING_TIMEOUT = "provisioning_timeout"
    # Misspelled name kept as an alias of PROVISIONING_TIMEOUT (same value)
    # so existing references keep working; prefer the correct spelling.
    PROOVISIONING_TIMEOUT = "provisioning_timeout"
    # Generic failure; details go to ``termination_reason_message``.
    ERROR = "error"
    # A busy instance was found with its job already finished.
    JOB_FINISHED = "job_finished"
    # The instance's termination deadline has passed.
    TERMINATION_TIMEOUT = "termination_timeout"
    # The instance has not become running in time.
    STARTING_TIMEOUT = "starting_timeout"
    # No offers were found, or every offer failed.
    NO_OFFERS = "no_offers"
    # Assigned to sibling instances when the fleet's master instance failed to start.
    MASTER_FAILED = "master_failed"
    # NOTE(review): not assigned anywhere in the visible changes; running jobs
    # map it to JobTerminationReason.NO_BALANCE - confirm where it is set.
    NO_BALANCE = "no_balance"
229+
230+
219231
class Instance(CoreModel):
220232
id: UUID
221233
project_name: str
@@ -231,6 +243,7 @@ class Instance(CoreModel):
231243
unreachable: bool = False
232244
health_status: HealthStatus = HealthStatus.HEALTHY
233245
termination_reason: Optional[str] = None
246+
termination_reason_message: Optional[str] = None
234247
created: datetime.datetime
235248
region: Optional[str] = None
236249
availability_zone: Optional[str] = None

src/dstack/_internal/core/models/runs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ class JobTerminationReason(str, Enum):
138138
TERMINATED_BY_SERVER = "terminated_by_server"
139139
INACTIVITY_DURATION_EXCEEDED = "inactivity_duration_exceeded"
140140
TERMINATED_DUE_TO_UTILIZATION_POLICY = "terminated_due_to_utilization_policy"
141+
NO_BALANCE = "no_balance"
141142
# Set by the runner
142143
CONTAINER_EXITED_WITH_ERROR = "container_exited_with_error"
143144
PORTS_BINDING_FAILED = "ports_binding_failed"
@@ -161,6 +162,7 @@ def to_status(self) -> JobStatus:
161162
self.TERMINATED_BY_SERVER: JobStatus.TERMINATED,
162163
self.INACTIVITY_DURATION_EXCEEDED: JobStatus.TERMINATED,
163164
self.TERMINATED_DUE_TO_UTILIZATION_POLICY: JobStatus.TERMINATED,
165+
self.NO_BALANCE: JobStatus.TERMINATED,
164166
self.CONTAINER_EXITED_WITH_ERROR: JobStatus.FAILED,
165167
self.PORTS_BINDING_FAILED: JobStatus.FAILED,
166168
self.CREATING_CONTAINER_ERROR: JobStatus.FAILED,

src/dstack/_internal/server/background/tasks/process_instances.py

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
InstanceOfferWithAvailability,
4646
InstanceRuntime,
4747
InstanceStatus,
48+
InstanceTerminationReason,
4849
RemoteConnectionInfo,
4950
SSHKey,
5051
)
@@ -240,7 +241,7 @@ def _check_and_mark_terminating_if_idle_duration_expired(instance: InstanceModel
240241
delta = datetime.timedelta(seconds=idle_seconds)
241242
if idle_duration > delta:
242243
instance.status = InstanceStatus.TERMINATING
243-
instance.termination_reason = "Idle timeout"
244+
instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT.value
244245
logger.info(
245246
"Instance %s idle duration expired: idle time %ss. Terminating",
246247
instance.name,
@@ -262,7 +263,7 @@ async def _add_remote(instance: InstanceModel) -> None:
262263
retry_duration_deadline = instance.created_at + timedelta(seconds=PROVISIONING_TIMEOUT_SECONDS)
263264
if retry_duration_deadline < get_current_datetime():
264265
instance.status = InstanceStatus.TERMINATED
265-
instance.termination_reason = "Provisioning timeout expired"
266+
instance.termination_reason = InstanceTerminationReason.PROOVISIONING_TIMEOUT.value
266267
logger.warning(
267268
"Failed to start instance %s in %d seconds. Terminating...",
268269
instance.name,
@@ -285,7 +286,8 @@ async def _add_remote(instance: InstanceModel) -> None:
285286
ssh_proxy_pkeys = None
286287
except (ValueError, PasswordRequiredException):
287288
instance.status = InstanceStatus.TERMINATED
288-
instance.termination_reason = "Unsupported private SSH key type"
289+
instance.termination_reason = InstanceTerminationReason.ERROR.value
290+
instance.termination_reason_message = "Unsupported private SSH key type"
289291
logger.warning(
290292
"Failed to add instance %s: unsupported private SSH key type",
291293
instance.name,
@@ -343,7 +345,10 @@ async def _add_remote(instance: InstanceModel) -> None:
343345
)
344346
if instance_network is not None and internal_ip is None:
345347
instance.status = InstanceStatus.TERMINATED
346-
instance.termination_reason = "Failed to locate internal IP address on the given network"
348+
instance.termination_reason = InstanceTerminationReason.ERROR.value
349+
instance.termination_reason_message = (
350+
"Failed to locate internal IP address on the given network"
351+
)
347352
logger.warning(
348353
"Failed to add instance %s: failed to locate internal IP address on the given network",
349354
instance.name,
@@ -356,7 +361,8 @@ async def _add_remote(instance: InstanceModel) -> None:
356361
if internal_ip is not None:
357362
if not is_ip_among_addresses(ip_address=internal_ip, addresses=host_network_addresses):
358363
instance.status = InstanceStatus.TERMINATED
359-
instance.termination_reason = (
364+
instance.termination_reason = InstanceTerminationReason.ERROR.value
365+
instance.termination_reason_message = (
360366
"Specified internal IP not found among instance interfaces"
361367
)
362368
logger.warning(
@@ -378,7 +384,8 @@ async def _add_remote(instance: InstanceModel) -> None:
378384
instance.total_blocks = blocks
379385
else:
380386
instance.status = InstanceStatus.TERMINATED
381-
instance.termination_reason = "Cannot split into blocks"
387+
instance.termination_reason = InstanceTerminationReason.ERROR.value
388+
instance.termination_reason_message = "Cannot split into blocks"
382389
logger.warning(
383390
"Failed to add instance %s: cannot split into blocks",
384391
instance.name,
@@ -497,7 +504,8 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
497504
requirements = get_instance_requirements(instance)
498505
except ValidationError as e:
499506
instance.status = InstanceStatus.TERMINATED
500-
instance.termination_reason = (
507+
instance.termination_reason = InstanceTerminationReason.ERROR.value
508+
instance.termination_reason_message = (
501509
f"Error to parse profile, requirements or instance_configuration: {e}"
502510
)
503511
logger.warning(
@@ -645,7 +653,7 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
645653
)
646654
return
647655

648-
_mark_terminated(instance, "All offers failed" if offers else "No offers found")
656+
_mark_terminated(instance, InstanceTerminationReason.NO_OFFERS.value)
649657
if (
650658
instance.fleet
651659
and _is_fleet_master_instance(instance)
@@ -656,7 +664,7 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
656664
for sibling_instance in instance.fleet.instances:
657665
if sibling_instance.id == instance.id:
658666
continue
659-
_mark_terminated(sibling_instance, "Master instance failed to start")
667+
_mark_terminated(sibling_instance, InstanceTerminationReason.MASTER_FAILED.value)
660668

661669

662670
def _mark_terminated(instance: InstanceModel, termination_reason: str) -> None:
@@ -681,7 +689,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non
681689
):
682690
# A busy instance could have no active jobs due to this bug: https://github.com/dstackai/dstack/issues/2068
683691
instance.status = InstanceStatus.TERMINATING
684-
instance.termination_reason = "Instance job finished"
692+
instance.termination_reason = InstanceTerminationReason.JOB_FINISHED.value
685693
logger.info(
686694
"Detected busy instance %s with finished job. Marked as TERMINATING",
687695
instance.name,
@@ -810,7 +818,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non
810818
deadline = instance.termination_deadline
811819
if get_current_datetime() > deadline:
812820
instance.status = InstanceStatus.TERMINATING
813-
instance.termination_reason = "Termination deadline"
821+
instance.termination_reason = InstanceTerminationReason.TERMINATION_TIMEOUT.value
814822
logger.warning(
815823
"Instance %s shim waiting timeout. Marked as TERMINATING",
816824
instance.name,
@@ -839,7 +847,7 @@ async def _wait_for_instance_provisioning_data(
839847
"Instance %s failed because instance has not become running in time", instance.name
840848
)
841849
instance.status = InstanceStatus.TERMINATING
842-
instance.termination_reason = "Instance has not become running in time"
850+
instance.termination_reason = InstanceTerminationReason.STARTING_TIMEOUT.value
843851
return
844852

845853
backend = await backends_services.get_project_backend_by_type(
@@ -852,7 +860,8 @@ async def _wait_for_instance_provisioning_data(
852860
instance.name,
853861
)
854862
instance.status = InstanceStatus.TERMINATING
855-
instance.termination_reason = "Backend not available"
863+
instance.termination_reason = InstanceTerminationReason.ERROR.value
864+
instance.termination_reason_message = "Backend not available"
856865
return
857866
try:
858867
await run_async(
@@ -869,7 +878,8 @@ async def _wait_for_instance_provisioning_data(
869878
repr(e),
870879
)
871880
instance.status = InstanceStatus.TERMINATING
872-
instance.termination_reason = "Error while waiting for instance to become running"
881+
instance.termination_reason = InstanceTerminationReason.ERROR.value
882+
instance.termination_reason_message = "Error while waiting for instance to become running"
873883
except Exception:
874884
logger.exception(
875885
"Got exception when updating instance %s provisioning data", instance.name

src/dstack/_internal/server/background/tasks/process_running_jobs.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from dstack._internal.core.models.files import FileArchiveMapping
1919
from dstack._internal.core.models.instances import (
2020
InstanceStatus,
21+
InstanceTerminationReason,
2122
RemoteConnectionInfo,
2223
SSHConnectionParams,
2324
)
@@ -372,6 +373,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
372373
job_model.status = JobStatus.TERMINATING
373374
# job will be terminated and instance will be emptied by process_terminating_jobs
374375
else:
376+
# job_model.instance.termination_reason is checked below to tell a NO_BALANCE shutdown apart from a lost connection
375377
# No job_model.termination_reason set means ssh connection failed
376378
if job_model.disconnected_at is None:
377379
job_model.disconnected_at = common_utils.get_current_datetime()
@@ -383,7 +385,14 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
383385
)
384386
# TODO: Replace with JobTerminationReason.INSTANCE_UNREACHABLE in 0.20 or
385387
# when CLI <= 0.19.8 is no longer supported
386-
job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
388+
if (
389+
job_model.instance is not None
390+
and job_model.instance.termination_reason
391+
== InstanceTerminationReason.NO_BALANCE.value
392+
):
393+
job_model.termination_reason = JobTerminationReason.NO_BALANCE
394+
else:
395+
job_model.termination_reason = JobTerminationReason.INSTANCE_UNREACHABLE
387396
job_model.status = JobStatus.TERMINATING
388397
else:
389398
logger.warning(
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""instance.termination_reason_message
2+
3+
Revision ID: a16a05249504
4+
Revises: 2498ab323443
5+
Create Date: 2025-10-13 15:29:56.691164
6+
7+
"""
8+
9+
import sqlalchemy as sa
10+
from alembic import op
11+
12+
# revision identifiers, used by Alembic.
13+
revision = "a16a05249504"
14+
down_revision = "2498ab323443"
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade() -> None:
    """Add the nullable ``termination_reason_message`` column to ``instances``."""
    # Describe the new column first, then apply it inside the batch context.
    message_column = sa.Column(
        "termination_reason_message", sa.String(length=4000), nullable=True
    )
    with op.batch_alter_table("instances", schema=None) as instances_table:
        instances_table.add_column(message_column)
27+
28+
29+
def downgrade() -> None:
    """Drop the ``termination_reason_message`` column from ``instances``."""
    with op.batch_alter_table("instances", schema=None) as instances_table:
        instances_table.drop_column("termination_reason_message")

src/dstack/_internal/server/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -615,6 +615,7 @@ class InstanceModel(BaseModel):
615615
# instance termination handling
616616
termination_deadline: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
617617
termination_reason: Mapped[Optional[str]] = mapped_column(String(4000))
618+
termination_reason_message: Mapped[Optional[str]] = mapped_column(String(4000))
618619
# Deprecated since 0.19.22, not used
619620
health_status: Mapped[Optional[str]] = mapped_column(String(4000))
620621
health: Mapped[HealthStatus] = mapped_column(

src/dstack/_internal/server/services/instances.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
122122
unreachable=instance_model.unreachable,
123123
health_status=instance_model.health,
124124
termination_reason=instance_model.termination_reason,
125+
termination_reason_message=instance_model.termination_reason_message,
125126
created=instance_model.created_at,
126127
total_blocks=instance_model.total_blocks,
127128
busy_blocks=instance_model.busy_blocks,

src/tests/_internal/server/background/tasks/test_process_instances.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
InstanceOffer,
2828
InstanceOfferWithAvailability,
2929
InstanceStatus,
30+
InstanceTerminationReason,
3031
InstanceType,
3132
Resources,
3233
)
@@ -251,7 +252,7 @@ async def test_check_shim_terminate_instance_by_deadline(self, test_db, session:
251252
assert instance is not None
252253
assert instance.status == InstanceStatus.TERMINATING
253254
assert instance.termination_deadline == termination_deadline_time
254-
assert instance.termination_reason == "Termination deadline"
255+
assert instance.termination_reason == InstanceTerminationReason.TERMINATION_TIMEOUT.value
255256

256257
@pytest.mark.asyncio
257258
@pytest.mark.parametrize(
@@ -510,7 +511,7 @@ async def test_terminate_by_idle_timeout(self, test_db, session: AsyncSession):
510511
await session.refresh(instance)
511512
assert instance is not None
512513
assert instance.status == InstanceStatus.TERMINATING
513-
assert instance.termination_reason == "Idle timeout"
514+
assert instance.termination_reason == InstanceTerminationReason.IDLE_TIMEOUT.value
514515

515516

516517
class TestSSHInstanceTerminateProvisionTimeoutExpired:
@@ -531,7 +532,7 @@ async def test_terminate_by_idle_timeout(self, test_db, session: AsyncSession):
531532

532533
await session.refresh(instance)
533534
assert instance.status == InstanceStatus.TERMINATED
534-
assert instance.termination_reason == "Provisioning timeout expired"
535+
assert instance.termination_reason == InstanceTerminationReason.PROOVISIONING_TIMEOUT.value
535536

536537

537538
class TestTerminate:
@@ -800,7 +801,7 @@ async def test_fails_if_all_offers_fail(self, session: AsyncSession, err: Except
800801

801802
await session.refresh(instance)
802803
assert instance.status == InstanceStatus.TERMINATED
803-
assert instance.termination_reason == "All offers failed"
804+
assert instance.termination_reason == InstanceTerminationReason.NO_OFFERS.value
804805

805806
async def test_fails_if_no_offers(self, session: AsyncSession):
806807
project = await create_project(session=session)
@@ -813,19 +814,19 @@ async def test_fails_if_no_offers(self, session: AsyncSession):
813814

814815
await session.refresh(instance)
815816
assert instance.status == InstanceStatus.TERMINATED
816-
assert instance.termination_reason == "No offers found"
817+
assert instance.termination_reason == InstanceTerminationReason.NO_OFFERS.value
817818

818819
@pytest.mark.parametrize(
819820
("placement", "expected_termination_reasons"),
820821
[
821822
pytest.param(
822823
InstanceGroupPlacement.CLUSTER,
823-
{"No offers found": 1, "Master instance failed to start": 3},
824+
{InstanceTerminationReason.NO_OFFERS.value: 1, InstanceTerminationReason.MASTER_FAILED.value: 3},
824825
id="cluster",
825826
),
826827
pytest.param(
827828
None,
828-
{"No offers found": 4},
829+
{InstanceTerminationReason.NO_OFFERS.value: 4},
829830
id="non-cluster",
830831
),
831832
],

0 commit comments

Comments
 (0)