|
83 | 83 | from dstack._internal.server.services.fleets import ( |
84 | 84 | fleet_model_to_fleet, |
85 | 85 | get_create_instance_offers, |
86 | | - get_fleet_spec, |
87 | 86 | ) |
88 | 87 | from dstack._internal.server.services.instances import ( |
89 | 88 | get_instance_configuration, |
@@ -574,10 +573,7 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No |
574 | 573 | _is_fleet_master_instance(instance) |
575 | 574 | and instance_offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT |
576 | 575 | and instance.fleet |
577 | | - and ( |
578 | | - get_fleet_spec(instance.fleet).configuration.placement |
579 | | - == InstanceGroupPlacement.CLUSTER |
580 | | - ) |
| 576 | + and _is_cloud_cluster(instance.fleet) |
581 | 577 | ): |
582 | 578 | assert isinstance(compute, ComputeWithPlacementGroupSupport) |
583 | 579 | placement_group_model = _find_suitable_placement_group( |
@@ -666,37 +662,32 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No |
666 | 662 | instance.last_retry_at = get_current_datetime() |
667 | 663 |
|
668 | 664 | if not should_retry: |
669 | | - instance.status = InstanceStatus.TERMINATED |
670 | | - instance.termination_reason = "All offers failed" if offers else "No offers found" |
671 | | - logger.info( |
672 | | - "Terminated instance %s: %s", |
673 | | - instance.name, |
674 | | - instance.termination_reason, |
675 | | - extra={ |
676 | | - "instance_name": instance.name, |
677 | | - "instance_status": InstanceStatus.TERMINATED.value, |
678 | | - }, |
679 | | - ) |
680 | | - if instance.fleet and _is_fleet_master_instance(instance): |
| 665 | + _mark_terminated(instance, "All offers failed" if offers else "No offers found") |
| 666 | + if ( |
| 667 | + instance.fleet |
| 668 | + and _is_fleet_master_instance(instance) |
| 669 | + and _is_cloud_cluster(instance.fleet) |
| 670 | + ): |
681 | 671 | # Do not attempt to deploy other instances, as they won't determine the correct cluster |
682 | 672 | # backend, region, and placement group without a successfully deployed master instance |
683 | | - # FIXME(critical): this should only apply to placement: cluster |
684 | 673 | for sibling_instance in instance.fleet.instances: |
685 | 674 | if sibling_instance.id == instance.id: |
686 | 675 | continue |
687 | | - if sibling_instance.status == InstanceStatus.PENDING: |
688 | | - sibling_instance.status = InstanceStatus.TERMINATED |
689 | | - else: |
690 | | - logger.error( |
691 | | - "Instance %s has unexpected status %s." |
692 | | - " Should have been %s, as master instance %s has not been provisioned", |
693 | | - sibling_instance.name, |
694 | | - sibling_instance.status.value, |
695 | | - InstanceStatus.PENDING.value, |
696 | | - instance.name, |
697 | | - ) |
698 | | - sibling_instance.status = InstanceStatus.TERMINATING |
699 | | - sibling_instance.termination_reason = "Master instance failed to start" |
| 676 | + _mark_terminated(sibling_instance, "Master instance failed to start") |
| 677 | + |
| 678 | + |
def _mark_terminated(instance: InstanceModel, termination_reason: str) -> None:
    """Set ``instance`` to the TERMINATED status and log why.

    Mutates ``instance.status`` and ``instance.termination_reason`` in place, then emits an
    info-level log record with the instance name and the new status as structured extras.
    """
    terminated = InstanceStatus.TERMINATED
    instance.status = terminated
    instance.termination_reason = termination_reason
    # Structured fields attached to the log record alongside the formatted message.
    log_extra = {
        "instance_name": instance.name,
        "instance_status": terminated.value,
    }
    logger.info(
        "Terminated instance %s: %s",
        instance.name,
        instance.termination_reason,
        extra=log_extra,
    )
700 | 691 |
|
701 | 692 |
|
702 | 693 | async def _check_instance(instance: InstanceModel) -> None: |
@@ -980,17 +971,21 @@ def _need_to_wait_fleet_provisioning(instance: InstanceModel) -> bool: |
980 | 971 | or instance.fleet.instances[0].status == InstanceStatus.TERMINATED |
981 | 972 | ): |
982 | 973 | return False |
983 | | - fleet = fleet_model_to_fleet(instance.fleet) |
984 | | - return ( |
985 | | - fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER |
986 | | - and fleet.spec.configuration.ssh_config is None |
987 | | - ) |
| 974 | + return _is_cloud_cluster(instance.fleet) |
988 | 975 |
|
989 | 976 |
|
990 | 977 | def _is_fleet_master_instance(instance: InstanceModel) -> bool: |
991 | 978 | return instance.fleet is not None and instance.id == instance.fleet.instances[0].id |
992 | 979 |
|
993 | 980 |
|
def _is_cloud_cluster(fleet_model: FleetModel) -> bool:
    """Tell whether the fleet uses cluster placement and has no SSH config.

    The fleet model is converted with ``fleet_model_to_fleet`` and the check reads the
    resulting spec's configuration.
    """
    configuration = fleet_model_to_fleet(fleet_model).spec.configuration
    is_cluster = configuration.placement == InstanceGroupPlacement.CLUSTER
    # A fleet with an ssh_config is excluded (presumably an SSH fleet rather than a
    # cloud-provisioned one -- confirm against the fleet configuration model).
    return is_cluster and configuration.ssh_config is None
| 987 | + |
| 988 | + |
994 | 989 | def _get_instance_offer_for_instance( |
995 | 990 | instance_offer: InstanceOfferWithAvailability, |
996 | 991 | instance: InstanceModel, |
|
0 commit comments