Skip to content

Commit 58181d1

Browse files
authored
Fix trying fleet instance offers (#2443)
- In case of unexpected provisioning errors, move on to the next offer instead of retrying the same offer indefinitely. - Limit the number of offers tried to prevent long processing in case all offers fail. - (chore) Remove `except NotImplementedError`, offers are already filtered to only include backends with `create_instance` support.
1 parent 621b18d commit 58181d1

File tree

2 files changed

+79
-6
lines changed

2 files changed

+79
-6
lines changed

src/dstack/_internal/server/background/tasks/process_instances.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
Retry,
6565
)
6666
from dstack._internal.core.services.profiles import get_retry
67+
from dstack._internal.server import settings as server_settings
6768
from dstack._internal.server.background.tasks.common import get_provisioning_timeout
6869
from dstack._internal.server.db import get_session_ctx
6970
from dstack._internal.server.models import (
@@ -529,7 +530,9 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
529530
session=session, fleet_id=instance.fleet_id
530531
)
531532

532-
for backend, instance_offer in offers:
533+
# Limit number of offers tried to prevent long-running processing
534+
# in case all offers fail.
535+
for backend, instance_offer in offers[: server_settings.MAX_OFFERS_TRIED]:
533536
if instance_offer.backend not in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT:
534537
continue
535538
compute = backend.compute()
@@ -578,8 +581,13 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
578581
extra={"instance_name": instance.name},
579582
)
580583
continue
581-
except NotImplementedError:
582-
# skip a backend without create_instance support, continue with next backend and offer
584+
except Exception:
585+
logger.exception(
586+
"Got exception when launching %s in %s/%s",
587+
instance_offer.instance.name,
588+
instance_offer.backend.value,
589+
instance_offer.region,
590+
)
583591
continue
584592

585593
instance.status = InstanceStatus.PROVISIONING
@@ -607,10 +615,11 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
607615

608616
if not should_retry:
609617
instance.status = InstanceStatus.TERMINATED
610-
instance.termination_reason = "No offers found"
618+
instance.termination_reason = "All offers failed" if offers else "No offers found"
611619
logger.info(
612-
"No offers found. Terminated instance %s",
620+
"Terminated instance %s: %s",
613621
instance.name,
622+
instance.termination_reason,
614623
extra={
615624
"instance_name": instance.name,
616625
"instance_status": InstanceStatus.TERMINATED.value,

src/tests/_internal/server/background/tasks/test_process_instances.py

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from freezegun import freeze_time
99
from sqlalchemy.ext.asyncio import AsyncSession
1010

11-
from dstack._internal.core.errors import BackendError
11+
from dstack._internal.core.errors import BackendError, ProvisioningError
1212
from dstack._internal.core.models.backends.base import BackendType
1313
from dstack._internal.core.models.instances import (
1414
Gpu,
@@ -35,6 +35,8 @@
3535
create_repo,
3636
create_run,
3737
create_user,
38+
get_instance_offer_with_availability,
39+
get_job_provisioning_data,
3840
get_remote_connection_info,
3941
)
4042
from dstack._internal.utils.common import get_current_datetime
@@ -557,6 +559,68 @@ async def test_creates_instance(
557559
assert instance.total_blocks == expected_blocks
558560
assert instance.busy_blocks == 0
559561

562+
@pytest.mark.parametrize("err", [RuntimeError("Unexpected"), ProvisioningError("Expected")])
563+
async def test_tries_second_offer_if_first_fails(self, session: AsyncSession, err: Exception):
564+
project = await create_project(session=session)
565+
instance = await create_instance(
566+
session=session, project=project, status=InstanceStatus.PENDING
567+
)
568+
aws_mock = Mock()
569+
aws_mock.TYPE = BackendType.AWS
570+
offer = get_instance_offer_with_availability(backend=BackendType.AWS, price=1.0)
571+
aws_mock.compute.return_value = Mock(spec=ComputeMockSpec)
572+
aws_mock.compute.return_value.get_offers_cached.return_value = [offer]
573+
aws_mock.compute.return_value.create_instance.side_effect = err
574+
gcp_mock = Mock()
575+
gcp_mock.TYPE = BackendType.GCP
576+
offer = get_instance_offer_with_availability(backend=BackendType.GCP, price=2.0)
577+
gcp_mock.compute.return_value = Mock(spec=ComputeMockSpec)
578+
gcp_mock.compute.return_value.get_offers_cached.return_value = [offer]
579+
gcp_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data(
580+
backend=offer.backend, region=offer.region, price=offer.price
581+
)
582+
with patch("dstack._internal.server.services.backends.get_project_backends") as m:
583+
m.return_value = [aws_mock, gcp_mock]
584+
await process_instances()
585+
586+
await session.refresh(instance)
587+
assert instance.status == InstanceStatus.PROVISIONING
588+
aws_mock.compute.return_value.create_instance.assert_called_once()
589+
assert instance.backend == BackendType.GCP
590+
591+
@pytest.mark.parametrize("err", [RuntimeError("Unexpected"), ProvisioningError("Expected")])
592+
async def test_fails_if_all_offers_fail(self, session: AsyncSession, err: Exception):
593+
project = await create_project(session=session)
594+
instance = await create_instance(
595+
session=session, project=project, status=InstanceStatus.PENDING
596+
)
597+
aws_mock = Mock()
598+
aws_mock.TYPE = BackendType.AWS
599+
offer = get_instance_offer_with_availability(backend=BackendType.AWS, price=1.0)
600+
aws_mock.compute.return_value = Mock(spec=ComputeMockSpec)
601+
aws_mock.compute.return_value.get_offers_cached.return_value = [offer]
602+
aws_mock.compute.return_value.create_instance.side_effect = err
603+
with patch("dstack._internal.server.services.backends.get_project_backends") as m:
604+
m.return_value = [aws_mock]
605+
await process_instances()
606+
607+
await session.refresh(instance)
608+
assert instance.status == InstanceStatus.TERMINATED
609+
assert instance.termination_reason == "All offers failed"
610+
611+
async def test_fails_if_no_offers(self, session: AsyncSession):
612+
project = await create_project(session=session)
613+
instance = await create_instance(
614+
session=session, project=project, status=InstanceStatus.PENDING
615+
)
616+
with patch("dstack._internal.server.services.backends.get_project_backends") as m:
617+
m.return_value = []
618+
await process_instances()
619+
620+
await session.refresh(instance)
621+
assert instance.status == InstanceStatus.TERMINATED
622+
assert instance.termination_reason == "No offers found"
623+
560624

561625
@pytest.mark.asyncio
562626
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)

0 commit comments

Comments
 (0)