Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions docs/docs/concepts/backends.md
Original file line number Diff line number Diff line change
Expand Up @@ -579,9 +579,9 @@ gcloud projects list --format="json(projectId)"
Using private subnets assumes that both the `dstack` server and users can access the configured VPC's private subnets.
Additionally, [Cloud NAT](https://cloud.google.com/nat/docs/overview) must be configured to provide access to external resources for provisioned instances.

## Hotaisle
## Hot Aisle

Log in to the SSH TUI as described in the [Hotaisle Quick Start :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/quick-start/).
Log in to the SSH TUI as described in the [Hot Aisle Quick Start :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/quick-start/).
Create a new team and generate an API key for the member in the team.
Comment thread
jvstme marked this conversation as resolved.

Then, go ahead and configure the backend:
Expand All @@ -601,6 +601,12 @@ projects:

</div>

??? info "Required permissions"
The API key must have the following roles assigned:

* **Owner role for the user** - Required for creating and managing SSH keys
* **Operator role for the team** - Required for managing virtual machines within the team

## Lambda

Log into your [Lambda Cloud :material-arrow-top-right-thin:{ .external }](https://lambdalabs.com/service/gpu-cloud) account, click API keys in the sidebar, and then click the `Generate API key`
Expand Down
6 changes: 3 additions & 3 deletions docs/docs/reference/server/config.yml.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ to configure [backends](../../concepts/backends.md) and other [server-level setti
overrides:
show_root_heading: false
backends:
type: 'Union[AWSBackendConfigWithCreds, AzureBackendConfigWithCreds, GCPBackendConfigWithCreds, HotaisleBackendConfigWithCreds, LambdaBackendConfigWithCreds, NebiusBackendConfigWithCreds, RunpodBackendConfigWithCreds, VastAIBackendConfigWithCreds, KubernetesConfig]'
type: 'Union[AWSBackendConfigWithCreds, AzureBackendConfigWithCreds, GCPBackendConfigWithCreds, HotAisleBackendConfigWithCreds, LambdaBackendConfigWithCreds, NebiusBackendConfigWithCreds, RunpodBackendConfigWithCreds, VastAIBackendConfigWithCreds, KubernetesConfig]'

#### `projects[n].backends` { #backends data-toc-label="backends" }

Expand Down Expand Up @@ -128,7 +128,7 @@ to configure [backends](../../concepts/backends.md) and other [server-level setti

##### `projects[n].backends[type=hotaisle]` { #hotaisle data-toc-label="hotaisle" }

#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotaisleBackendConfigWithCreds
#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotAisleBackendConfigWithCreds
overrides:
show_root_heading: false
type:
Expand All @@ -137,7 +137,7 @@ to configure [backends](../../concepts/backends.md) and other [server-level setti

###### `projects[n].backends[type=hotaisle].creds` { #hotaisle-creds data-toc-label="creds" }

#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotaisleAPIKeyCreds
#SCHEMA# dstack._internal.core.backends.hotaisle.models.HotAisleAPIKeyCreds
overrides:
show_root_heading: false
type:
Expand Down
4 changes: 2 additions & 2 deletions src/dstack/_internal/core/backends/configurators.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,10 @@

try:
from dstack._internal.core.backends.hotaisle.configurator import (
HotaisleConfigurator,
HotAisleConfigurator,
)

_CONFIGURATOR_CLASSES.append(HotaisleConfigurator)
_CONFIGURATOR_CLASSES.append(HotAisleConfigurator)
except ImportError:
pass

Expand Down
62 changes: 33 additions & 29 deletions src/dstack/_internal/core/backends/hotaisle/api_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@

import requests

from dstack._internal.core.backends.base.configurator import raise_invalid_credentials_error
from dstack._internal.utils.logging import get_logger

API_URL = "https://admin.hotaisle.app/api"

logger = get_logger(__name__)


class HotaisleAPIClient:
class HotAisleAPIClient:
def __init__(self, api_key: str, team_handle: str):
self.api_key = api_key
self.team_handle = team_handle
Expand All @@ -19,28 +20,42 @@ def validate_api_key(self) -> bool:
self._validate_user_and_team()
return True
except requests.HTTPError as e:
if e.response.status_code in [401, 403]:
return False
if e.response.status_code == 401:
raise_invalid_credentials_error(
fields=[["creds", "api_key"]], details="Invalid API key"
)
elif e.response.status_code == 403:
raise_invalid_credentials_error(
fields=[["creds", "api_key"]],
details="Authenticated user does not have required permissions",
)
raise e
except ValueError as e:
error_message = str(e)
if "No Hot Aisle teams found" in error_message:
raise_invalid_credentials_error(
fields=[["creds", "api_key"]],
details="Valid API key but no teams found for this user",
)
elif "not found" in error_message:
raise_invalid_credentials_error(
fields=[["team_handle"]], details=f"Team handle '{self.team_handle}' not found"
)
Comment on lines +33 to +43
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(nit) Looking for patterns in our own error messages and then raising with another error message looks quite redundant. It's also error-prone, because we can change the error message in _validate_user_and_team and forget to change it here.

Some alternatives I can suggest:

  • Raise with the same error message - raise_invalid_credentials_error(details=str(e), ...)
  • Call raise_invalid_credentials_error directly in _validate_user_and_team
  • (my favorite) Merge validate_api_key and _validate_user_and_team into one method and call raise_invalid_credentials_error directly

raise e
except ValueError:
return False

def _validate_user_and_team(self) -> None:
url = f"{API_URL}/user/"
response = self._make_request("GET", url)

if response.ok:
user_data = response.json()
else:
response.raise_for_status()
response.raise_for_status()
user_data = response.json()

teams = user_data.get("teams", [])
if not teams:
raise ValueError("No Hotaisle teams found for this user")
raise ValueError("No Hot Aisle teams found for this user")

available_teams = [team["handle"] for team in teams]
if self.team_handle not in available_teams:
raise ValueError(f"Hotaisle Team '{self.team_handle}' not found.")
raise ValueError(f"Hot Aisle team '{self.team_handle}' not found.")

def upload_ssh_key(self, public_key: str) -> bool:
url = f"{API_URL}/user/ssh_keys/"
Expand All @@ -50,47 +65,36 @@ def upload_ssh_key(self, public_key: str) -> bool:

if response.status_code == 409:
return True # Key already exists - success
if not response.ok:
response.raise_for_status()
response.raise_for_status()
return True

def create_virtual_machine(
self, vm_payload: Dict[str, Any], instance_name: str
) -> Dict[str, Any]:
url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/"
response = self._make_request("POST", url, json=vm_payload)

if not response.ok:
response.raise_for_status()

response.raise_for_status()
vm_data = response.json()
return vm_data

def get_vm_state(self, vm_name: str) -> str:
url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/{vm_name}/state/"
response = self._make_request("GET", url)

if not response.ok:
response.raise_for_status()

response.raise_for_status()
state_data = response.json()
return state_data["state"]

def terminate_virtual_machine(self, vm_name: str) -> bool:
def terminate_virtual_machine(self, vm_name: str) -> None:
url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/{vm_name}/"
response = self._make_request("DELETE", url)

if response.status_code == 204:
return True
else:
response.raise_for_status()
response.raise_for_status()

def _make_request(
self, method: str, url: str, json: Optional[Dict[str, Any]] = None, timeout: int = 30
) -> requests.Response:
headers = {
"accept": "application/json",
"Authorization": self.api_key,
"Authorization": f"Token {self.api_key}",
}
if json is not None:
headers["Content-Type"] = "application/json"
Expand Down
14 changes: 7 additions & 7 deletions src/dstack/_internal/core/backends/hotaisle/backend.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
from dstack._internal.core.backends.base.backend import Backend
from dstack._internal.core.backends.hotaisle.compute import HotaisleCompute
from dstack._internal.core.backends.hotaisle.models import HotaisleConfig
from dstack._internal.core.backends.hotaisle.compute import HotAisleCompute
from dstack._internal.core.backends.hotaisle.models import HotAisleConfig
from dstack._internal.core.models.backends.base import BackendType


class HotaisleBackend(Backend):
class HotAisleBackend(Backend):
TYPE = BackendType.HOTAISLE
COMPUTE_CLASS = HotaisleCompute
COMPUTE_CLASS = HotAisleCompute

def __init__(self, config: HotaisleConfig):
def __init__(self, config: HotAisleConfig):
self.config = config
self._compute = HotaisleCompute(self.config)
self._compute = HotAisleCompute(self.config)

def compute(self) -> HotaisleCompute:
def compute(self) -> HotAisleCompute:
return self._compute
88 changes: 59 additions & 29 deletions src/dstack/_internal/core/backends/hotaisle/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@
get_shim_commands,
)
from dstack._internal.core.backends.base.offers import get_catalog_offers
from dstack._internal.core.backends.hotaisle.api_client import HotaisleAPIClient
from dstack._internal.core.backends.hotaisle.models import HotaisleConfig
from dstack._internal.core.backends.hotaisle.api_client import HotAisleAPIClient
from dstack._internal.core.backends.hotaisle.models import HotAisleConfig
from dstack._internal.core.models.backends.base import BackendType
from dstack._internal.core.models.common import CoreModel
from dstack._internal.core.models.instances import (
InstanceAvailability,
InstanceConfiguration,
Expand All @@ -31,14 +32,28 @@
MAX_INSTANCE_NAME_LEN = 60
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(nit) Unused



class HotaisleCompute(
INSTANCE_TYPE_SPECS = {
"1x MI300X 8x Xeon Platinum 8462Y+": {
"cpu_model": "Xeon Platinum 8462Y+",
"cpu_frequency": 2800000000,
"cpu_manufacturer": "Intel",
},
"1x MI300X 13x Xeon Platinum 8470": {
"cpu_model": "Xeon Platinum 8470",
"cpu_frequency": 2000000000,
"cpu_manufacturer": "Intel",
},
}


class HotAisleCompute(
ComputeWithCreateInstanceSupport,
Compute,
):
def __init__(self, config: HotaisleConfig):
def __init__(self, config: HotAisleConfig):
super().__init__()
self.config = config
self.api_client = HotaisleAPIClient(config.creds.api_key, config.team_handle)
self.api_client = HotAisleAPIClient(config.creds.api_key, config.team_handle)
self.catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False)
self.catalog.add_provider(
HotAisleProvider(api_key=config.creds.api_key, team_handle=config.team_handle)
Expand All @@ -53,41 +68,43 @@ def get_offers(
requirements=requirements,
catalog=self.catalog,
)
offers = [
InstanceOfferWithAvailability(
**offer.dict(), availability=InstanceAvailability.AVAILABLE
)
for offer in offers
]
return offers

supported_offers = []
for offer in offers:
if offer.instance.name in INSTANCE_TYPE_SPECS:
supported_offers.append(
InstanceOfferWithAvailability(
**offer.dict(), availability=InstanceAvailability.AVAILABLE
)
)
else:
logger.warning(
f"Skipping unsupported Hot Aisle instance type: {offer.instance.name}"
)

return supported_offers

def get_payload_from_offer(self, instance_type) -> dict:
# Only two instance types are available in Hotaisle with CPUs: 8-core and 13-core. Other fields are
# not configurable.
instance_type_name = instance_type.name
cpu_specs = INSTANCE_TYPE_SPECS[instance_type_name]
cpu_cores = instance_type.resources.cpus
if cpu_cores == 8:
cpu_model = "Xeon Platinum 8462Y+"
frequency = 2800000000
else: # cpu_cores == 13
cpu_model = "Xeon Platinum 8470"
frequency = 2000000000

return {
"cpu_cores": cpu_cores,
"cpus": {
"count": 1,
"manufacturer": "Intel",
"model": cpu_model,
"manufacturer": cpu_specs["cpu_manufacturer"],
"model": cpu_specs["cpu_model"],
"cores": cpu_cores,
"frequency": frequency,
"frequency": cpu_specs["cpu_frequency"],
},
"disk_capacity": 13194139533312,
"ram_capacity": 240518168576,
"disk_capacity": instance_type.resources.disk.size_mib * 1024**2,
"ram_capacity": instance_type.resources.memory_mib * 1024**2,
"gpus": [
{
"count": len(instance_type.resources.gpus),
"manufacturer": "AMD",
"model": "MI300X",
"manufacturer": instance_type.resources.gpus[0].vendor,
"model": instance_type.resources.gpus[0].name,
}
],
}
Expand Down Expand Up @@ -117,7 +134,9 @@ def create_instance(
ssh_port=22,
dockerized=True,
ssh_proxy=None,
backend_data=vm_data["ip_address"],
backend_data=HotAisleInstanceBackendData(
ip_address=vm_data["ip_address"], vm_id=vm_data["name"]
).json(),
)

def update_provisioning_data(
Expand All @@ -129,7 +148,8 @@ def update_provisioning_data(
vm_state = self.api_client.get_vm_state(provisioning_data.instance_id)
if vm_state == "running":
if provisioning_data.hostname is None and provisioning_data.backend_data:
provisioning_data.hostname = provisioning_data.backend_data
backend_data = HotAisleInstanceBackendData.load(provisioning_data.backend_data)
provisioning_data.hostname = backend_data.ip_address
commands = get_shim_commands(
authorized_keys=[project_ssh_public_key],
arch=provisioning_data.instance_type.resources.cpu_arch,
Expand Down Expand Up @@ -211,3 +231,13 @@ def _run_ssh_command(hostname: str, ssh_private_key: str, command: str):
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)


class HotAisleInstanceBackendData(CoreModel):
ip_address: str
vm_id: Optional[str] = None
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(nit) This field is unused, so I wouldn't include it in the model


@classmethod
def load(cls, raw: Optional[str]) -> "HotAisleInstanceBackendData":
assert raw is not None
return cls.__response__.parse_raw(raw)
Loading
Loading