dstackai · Bihan · Aug 7, 2025 · Jul 29, 2025 · Aug 1, 2025 · Aug 6, 2025
diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md
@@ -579,9 +579,9 @@ gcloud projects list --format="json(projectId)"
     Using private subnets assumes that both the `dstack` server and users can access the configured VPC's private subnets.
     Additionally, [Cloud NAT](https://cloud.google.com/nat/docs/overview) must be configured to provide access to external resources for provisioned instances.
 
-## Hotaisle
+## Hot Aisle
 
-Log in to the SSH TUI as described in the [Hotaisle Quick Start :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/quick-start/).
+Log in to the SSH TUI as described in the [Hot Aisle Quick Start :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/quick-start/).
 Create a new team and generate an API key for the member in the team.
 
 Then, go ahead and configure the backend:
@@ -601,6 +601,12 @@ projects:
 
 </div>
 
+??? info "Required permissions"
+    The API key must have the following roles assigned:
+
+    * **Owner role for the user** - Required for creating and managing SSH keys
+    * **Operator role for the team** - Required for managing virtual machines within the team
+
 ## Lambda
 
 Log into your [Lambda Cloud :material-arrow-top-right-thin:{ .external }](https://lambdalabs.com/service/gpu-cloud) account, click API keys in the sidebar, and then click the `Generate API key`

diff --git a/docs/docs/reference/server/config.yml.md b/docs/docs/reference/server/config.yml.md
@@ -15,7 +15,7 @@ to configure [backends](../../concepts/backends.md) and other [sever-level setti
     overrides:
         show_root_heading: false
         backends:
-            type: 'Union[AWSBackendConfigWithCreds, AzureBackendConfigWithCreds, GCPBackendConfigWithCreds, HotaisleBackendConfigWithCreds, LambdaBackendConfigWithCreds, NebiusBackendConfigWithCreds, RunpodBackendConfigWithCreds, VastAIBackendConfigWithCreds, KubernetesConfig]'
+            type: 'Union[AWSBackendConfigWithCreds, AzureBackendConfigWithCreds, GCPBackendConfigWithCreds, HotAisleBackendConfigWithCreds, LambdaBackendConfigWithCreds, NebiusBackendConfigWithCreds, RunpodBackendConfigWithCreds, VastAIBackendConfigWithCreds, KubernetesConfig]'
 
 #### `projects[n].backends` { #backends data-toc-label="backends" }
 
@@ -128,7 +128,7 @@ to configure [backends](../../concepts/backends.md) and other [sever-level setti
 
 ##### `projects[n].backends[type=hotaisle]` { #hotaisle data-toc-label="hotaisle" }
 
-#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotaisleBackendConfigWithCreds
+#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotAisleBackendConfigWithCreds
     overrides:
         show_root_heading: false
         type:
@@ -137,7 +137,7 @@ to configure [backends](../../concepts/backends.md) and other [sever-level setti
 
 ###### `projects[n].backends[type=hotaisle].creds` { #hotaisle-creds data-toc-label="creds" }
 
-#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotaisleAPIKeyCreds
+#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotAisleAPIKeyCreds
     overrides:
         show_root_heading: false
         type:

diff --git a/src/dstack/_internal/core/backends/configurators.py b/src/dstack/_internal/core/backends/configurators.py
@@ -56,10 +56,10 @@
 
 try:
     from dstack._internal.core.backends.hotaisle.configurator import (
-        HotaisleConfigurator,
+        HotAisleConfigurator,
     )
 
-    _CONFIGURATOR_CLASSES.append(HotaisleConfigurator)
+    _CONFIGURATOR_CLASSES.append(HotAisleConfigurator)
 except ImportError:
     pass
 

diff --git a/src/dstack/_internal/core/backends/hotaisle/api_client.py b/src/dstack/_internal/core/backends/hotaisle/api_client.py
@@ -2,14 +2,15 @@
 
 import requests
 
+from dstack._internal.core.backends.base.configurator import raise_invalid_credentials_error
 from dstack._internal.utils.logging import get_logger
 
 API_URL = "https://admin.hotaisle.app/api"
 
 logger = get_logger(__name__)
 
 
-class HotaisleAPIClient:
+class HotAisleAPIClient:
     def __init__(self, api_key: str, team_handle: str):
         self.api_key = api_key
         self.team_handle = team_handle
@@ -19,28 +20,42 @@ def validate_api_key(self) -> bool:
             self._validate_user_and_team()
             return True
         except requests.HTTPError as e:
-            if e.response.status_code in [401, 403]:
-                return False
+            if e.response.status_code == 401:
+                raise_invalid_credentials_error(
+                    fields=[["creds", "api_key"]], details="Invalid API key"
+                )
+            elif e.response.status_code == 403:
+                raise_invalid_credentials_error(
+                    fields=[["creds", "api_key"]],
+                    details="Authenticated user does not have required permissions",
+                )
+            raise e
+        except ValueError as e:
+            error_message = str(e)
+            if "No Hot Aisle teams found" in error_message:
+                raise_invalid_credentials_error(
+                    fields=[["creds", "api_key"]],
+                    details="Valid API key but no teams found for this user",
+                )
+            elif "not found" in error_message:
+                raise_invalid_credentials_error(
+                    fields=[["team_handle"]], details=f"Team handle '{self.team_handle}' not found"
+                )
             raise e
-        except ValueError:
-            return False
 
     def _validate_user_and_team(self) -> None:
         url = f"{API_URL}/user/"
         response = self._make_request("GET", url)
-
-        if response.ok:
-            user_data = response.json()
-        else:
-            response.raise_for_status()
+        response.raise_for_status()
+        user_data = response.json()
 
         teams = user_data.get("teams", [])
         if not teams:
-            raise ValueError("No Hotaisle teams found for this user")
+            raise ValueError("No Hot Aisle teams found for this user")
 
         available_teams = [team["handle"] for team in teams]
         if self.team_handle not in available_teams:
-            raise ValueError(f"Hotaisle Team '{self.team_handle}' not found.")
+            raise ValueError(f"Hot Aisle team '{self.team_handle}' not found.")
 
     def upload_ssh_key(self, public_key: str) -> bool:
         url = f"{API_URL}/user/ssh_keys/"
@@ -50,47 +65,36 @@ def upload_ssh_key(self, public_key: str) -> bool:
 
         if response.status_code == 409:
             return True  # Key already exists - success
-        if not response.ok:
-            response.raise_for_status()
+        response.raise_for_status()
         return True
 
     def create_virtual_machine(
         self, vm_payload: Dict[str, Any], instance_name: str
     ) -> Dict[str, Any]:
         url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/"
         response = self._make_request("POST", url, json=vm_payload)
-
-        if not response.ok:
-            response.raise_for_status()
-
+        response.raise_for_status()
         vm_data = response.json()
         return vm_data
 
     def get_vm_state(self, vm_name: str) -> str:
         url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/{vm_name}/state/"
         response = self._make_request("GET", url)
-
-        if not response.ok:
-            response.raise_for_status()
-
+        response.raise_for_status()
         state_data = response.json()
         return state_data["state"]
 
-    def terminate_virtual_machine(self, vm_name: str) -> bool:
+    def terminate_virtual_machine(self, vm_name: str) -> None:
         url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/{vm_name}/"
         response = self._make_request("DELETE", url)
-
-        if response.status_code == 204:
-            return True
-        else:
-            response.raise_for_status()
+        response.raise_for_status()
 
     def _make_request(
         self, method: str, url: str, json: Optional[Dict[str, Any]] = None, timeout: int = 30
     ) -> requests.Response:
         headers = {
             "accept": "application/json",
-            "Authorization": self.api_key,
+            "Authorization": f"Token {self.api_key}",
         }
         if json is not None:
             headers["Content-Type"] = "application/json"

diff --git a/src/dstack/_internal/core/backends/hotaisle/backend.py b/src/dstack/_internal/core/backends/hotaisle/backend.py
@@ -1,16 +1,16 @@
 from dstack._internal.core.backends.base.backend import Backend
-from dstack._internal.core.backends.hotaisle.compute import HotaisleCompute
-from dstack._internal.core.backends.hotaisle.models import HotaisleConfig
+from dstack._internal.core.backends.hotaisle.compute import HotAisleCompute
+from dstack._internal.core.backends.hotaisle.models import HotAisleConfig
 from dstack._internal.core.models.backends.base import BackendType
 
 
-class HotaisleBackend(Backend):
+class HotAisleBackend(Backend):
     TYPE = BackendType.HOTAISLE
-    COMPUTE_CLASS = HotaisleCompute
+    COMPUTE_CLASS = HotAisleCompute
 
-    def __init__(self, config: HotaisleConfig):
+    def __init__(self, config: HotAisleConfig):
         self.config = config
-        self._compute = HotaisleCompute(self.config)
+        self._compute = HotAisleCompute(self.config)
 
-    def compute(self) -> HotaisleCompute:
+    def compute(self) -> HotAisleCompute:
         return self._compute
diff --git a/src/dstack/_internal/core/backends/hotaisle/compute.py b/src/dstack/_internal/core/backends/hotaisle/compute.py
@@ -14,9 +14,10 @@
     get_shim_commands,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
-from dstack._internal.core.backends.hotaisle.api_client import HotaisleAPIClient
-from dstack._internal.core.backends.hotaisle.models import HotaisleConfig
+from dstack._internal.core.backends.hotaisle.api_client import HotAisleAPIClient
+from dstack._internal.core.backends.hotaisle.models import HotAisleConfig
 from dstack._internal.core.models.backends.base import BackendType
+from dstack._internal.core.models.common import CoreModel
 from dstack._internal.core.models.instances import (
     InstanceAvailability,
     InstanceConfiguration,
@@ -31,14 +32,28 @@
 MAX_INSTANCE_NAME_LEN = 60
 
 
-class HotaisleCompute(
+INSTANCE_TYPE_SPECS = {
+    "1x MI300X 8x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "1x MI300X 13x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+}
+
+
+class HotAisleCompute(
     ComputeWithCreateInstanceSupport,
     Compute,
 ):
-    def __init__(self, config: HotaisleConfig):
+    def __init__(self, config: HotAisleConfig):
         super().__init__()
         self.config = config
-        self.api_client = HotaisleAPIClient(config.creds.api_key, config.team_handle)
+        self.api_client = HotAisleAPIClient(config.creds.api_key, config.team_handle)
         self.catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False)
         self.catalog.add_provider(
             HotAisleProvider(api_key=config.creds.api_key, team_handle=config.team_handle)
@@ -53,41 +68,43 @@ def get_offers(
             requirements=requirements,
             catalog=self.catalog,
         )
-        offers = [
-            InstanceOfferWithAvailability(
-                **offer.dict(), availability=InstanceAvailability.AVAILABLE
-            )
-            for offer in offers
-        ]
-        return offers
+
+        supported_offers = []
+        for offer in offers:
+            if offer.instance.name in INSTANCE_TYPE_SPECS:
+                supported_offers.append(
+                    InstanceOfferWithAvailability(
+                        **offer.dict(), availability=InstanceAvailability.AVAILABLE
+                    )
+                )
+            else:
+                logger.warning(
+                    f"Skipping unsupported Hot Aisle instance type: {offer.instance.name}"
+                )
+
+        return supported_offers
 
     def get_payload_from_offer(self, instance_type) -> dict:
-        # Only two instance types are available in Hotaisle with CPUs: 8-core and 13-core. Other fields are
-        # not configurable.
+        instance_type_name = instance_type.name
+        cpu_specs = INSTANCE_TYPE_SPECS[instance_type_name]
         cpu_cores = instance_type.resources.cpus
-        if cpu_cores == 8:
-            cpu_model = "Xeon Platinum 8462Y+"
-            frequency = 2800000000
-        else:  # cpu_cores == 13
-            cpu_model = "Xeon Platinum 8470"
-            frequency = 2000000000
 
         return {
             "cpu_cores": cpu_cores,
             "cpus": {
                 "count": 1,
-                "manufacturer": "Intel",
-                "model": cpu_model,
+                "manufacturer": cpu_specs["cpu_manufacturer"],
+                "model": cpu_specs["cpu_model"],
                 "cores": cpu_cores,
-                "frequency": frequency,
+                "frequency": cpu_specs["cpu_frequency"],
             },
-            "disk_capacity": 13194139533312,
-            "ram_capacity": 240518168576,
+            "disk_capacity": instance_type.resources.disk.size_mib * 1024**2,
+            "ram_capacity": instance_type.resources.memory_mib * 1024**2,
             "gpus": [
                 {
                     "count": len(instance_type.resources.gpus),
-                    "manufacturer": "AMD",
-                    "model": "MI300X",
+                    "manufacturer": instance_type.resources.gpus[0].vendor,
+                    "model": instance_type.resources.gpus[0].name,
                 }
             ],
         }
@@ -117,7 +134,9 @@ def create_instance(
             ssh_port=22,
             dockerized=True,
             ssh_proxy=None,
-            backend_data=vm_data["ip_address"],
+            backend_data=HotAisleInstanceBackendData(
+                ip_address=vm_data["ip_address"], vm_id=vm_data["name"]
+            ).json(),
         )
 
     def update_provisioning_data(
@@ -129,7 +148,8 @@ def update_provisioning_data(
         vm_state = self.api_client.get_vm_state(provisioning_data.instance_id)
         if vm_state == "running":
             if provisioning_data.hostname is None and provisioning_data.backend_data:
-                provisioning_data.hostname = provisioning_data.backend_data
+                backend_data = HotAisleInstanceBackendData.load(provisioning_data.backend_data)
+                provisioning_data.hostname = backend_data.ip_address
             commands = get_shim_commands(
                 authorized_keys=[project_ssh_public_key],
                 arch=provisioning_data.instance_type.resources.cpu_arch,
@@ -211,3 +231,13 @@ def _run_ssh_command(hostname: str, ssh_private_key: str, command: str):
             stdout=subprocess.DEVNULL,
             stderr=subprocess.DEVNULL,
         )
+
+
+class HotAisleInstanceBackendData(CoreModel):
+    ip_address: str
+    vm_id: Optional[str] = None
+
+    @classmethod
+    def load(cls, raw: Optional[str]) -> "HotAisleInstanceBackendData":
+        assert raw is not None
+        return cls.__response__.parse_raw(raw)