-
-
Notifications
You must be signed in to change notification settings - Fork 222
add hotaisle backend #2935
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add hotaisle backend #2935
Changes from 1 commit
f5f2f2e
b9ca0be
9558c29
1bf72e0
16e77e3
d2846ae
c19065b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,14 +2,15 @@ | |
|
|
||
| import requests | ||
|
|
||
| from dstack._internal.core.backends.base.configurator import raise_invalid_credentials_error | ||
| from dstack._internal.utils.logging import get_logger | ||
|
|
||
| API_URL = "https://admin.hotaisle.app/api" | ||
|
|
||
| logger = get_logger(__name__) | ||
|
|
||
|
|
||
| class HotaisleAPIClient: | ||
| class HotAisleAPIClient: | ||
| def __init__(self, api_key: str, team_handle: str): | ||
| self.api_key = api_key | ||
| self.team_handle = team_handle | ||
|
|
@@ -19,28 +20,42 @@ def validate_api_key(self) -> bool: | |
| self._validate_user_and_team() | ||
| return True | ||
| except requests.HTTPError as e: | ||
| if e.response.status_code in [401, 403]: | ||
| return False | ||
| if e.response.status_code == 401: | ||
| raise_invalid_credentials_error( | ||
| fields=[["creds", "api_key"]], details="Invalid API key" | ||
| ) | ||
| elif e.response.status_code == 403: | ||
| raise_invalid_credentials_error( | ||
| fields=[["creds", "api_key"]], | ||
| details="Authenticated user does not have required permissions", ||
| ) | ||
| raise e | ||
| except ValueError as e: | ||
| error_message = str(e) | ||
| if "No Hot Aisle teams found" in error_message: | ||
| raise_invalid_credentials_error( | ||
| fields=[["creds", "api_key"]], | ||
| details="Valid API key but no teams found for this user", | ||
| ) | ||
| elif "not found" in error_message: | ||
| raise_invalid_credentials_error( | ||
| fields=[["team_handle"]], details=f"Team handle '{self.team_handle}' not found" | ||
| ) | ||
|
Comment on lines
+33
to
+43
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (nit) Looking for patterns in our own error messages and then raising with another error message looks quite redundant. It's also error-prone, because we can change the error message in `_validate_user_and_team` and forget to update the pattern checked here. Some alternatives I can suggest:
|
||
| raise e | ||
| except ValueError: | ||
| return False | ||
|
|
||
| def _validate_user_and_team(self) -> None: | ||
| url = f"{API_URL}/user/" | ||
| response = self._make_request("GET", url) | ||
|
|
||
| if response.ok: | ||
| user_data = response.json() | ||
| else: | ||
| response.raise_for_status() | ||
| response.raise_for_status() | ||
| user_data = response.json() | ||
|
|
||
| teams = user_data.get("teams", []) | ||
| if not teams: | ||
| raise ValueError("No Hotaisle teams found for this user") | ||
| raise ValueError("No Hot Aisle teams found for this user") | ||
|
|
||
| available_teams = [team["handle"] for team in teams] | ||
| if self.team_handle not in available_teams: | ||
| raise ValueError(f"Hotaisle Team '{self.team_handle}' not found.") | ||
| raise ValueError(f"Hot Aisle team '{self.team_handle}' not found.") | ||
|
|
||
| def upload_ssh_key(self, public_key: str) -> bool: | ||
| url = f"{API_URL}/user/ssh_keys/" | ||
|
|
@@ -50,47 +65,36 @@ def upload_ssh_key(self, public_key: str) -> bool: | |
|
|
||
| if response.status_code == 409: | ||
| return True # Key already exists - success | ||
| if not response.ok: | ||
| response.raise_for_status() | ||
| response.raise_for_status() | ||
| return True | ||
|
|
||
| def create_virtual_machine( | ||
| self, vm_payload: Dict[str, Any], instance_name: str | ||
| ) -> Dict[str, Any]: | ||
| url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/" | ||
| response = self._make_request("POST", url, json=vm_payload) | ||
|
|
||
| if not response.ok: | ||
| response.raise_for_status() | ||
|
|
||
| response.raise_for_status() | ||
| vm_data = response.json() | ||
| return vm_data | ||
|
|
||
| def get_vm_state(self, vm_name: str) -> str: | ||
| url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/{vm_name}/state/" | ||
| response = self._make_request("GET", url) | ||
|
|
||
| if not response.ok: | ||
| response.raise_for_status() | ||
|
|
||
| response.raise_for_status() | ||
| state_data = response.json() | ||
| return state_data["state"] | ||
|
|
||
| def terminate_virtual_machine(self, vm_name: str) -> bool: | ||
| def terminate_virtual_machine(self, vm_name: str) -> None: | ||
| url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/{vm_name}/" | ||
| response = self._make_request("DELETE", url) | ||
|
|
||
| if response.status_code == 204: | ||
| return True | ||
| else: | ||
| response.raise_for_status() | ||
| response.raise_for_status() | ||
|
|
||
| def _make_request( | ||
| self, method: str, url: str, json: Optional[Dict[str, Any]] = None, timeout: int = 30 | ||
| ) -> requests.Response: | ||
| headers = { | ||
| "accept": "application/json", | ||
| "Authorization": self.api_key, | ||
| "Authorization": f"Token {self.api_key}", | ||
| } | ||
| if json is not None: | ||
| headers["Content-Type"] = "application/json" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,16 +1,16 @@ | ||
| from dstack._internal.core.backends.base.backend import Backend | ||
| from dstack._internal.core.backends.hotaisle.compute import HotaisleCompute | ||
| from dstack._internal.core.backends.hotaisle.models import HotaisleConfig | ||
| from dstack._internal.core.backends.hotaisle.compute import HotAisleCompute | ||
| from dstack._internal.core.backends.hotaisle.models import HotAisleConfig | ||
| from dstack._internal.core.models.backends.base import BackendType | ||
|
|
||
|
|
||
| class HotaisleBackend(Backend): | ||
| class HotAisleBackend(Backend): | ||
| TYPE = BackendType.HOTAISLE | ||
| COMPUTE_CLASS = HotaisleCompute | ||
| COMPUTE_CLASS = HotAisleCompute | ||
|
|
||
| def __init__(self, config: HotaisleConfig): | ||
| def __init__(self, config: HotAisleConfig): | ||
| self.config = config | ||
| self._compute = HotaisleCompute(self.config) | ||
| self._compute = HotAisleCompute(self.config) | ||
|
|
||
| def compute(self) -> HotaisleCompute: | ||
| def compute(self) -> HotAisleCompute: | ||
| return self._compute |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,9 +14,10 @@ | |
| get_shim_commands, | ||
| ) | ||
| from dstack._internal.core.backends.base.offers import get_catalog_offers | ||
| from dstack._internal.core.backends.hotaisle.api_client import HotaisleAPIClient | ||
| from dstack._internal.core.backends.hotaisle.models import HotaisleConfig | ||
| from dstack._internal.core.backends.hotaisle.api_client import HotAisleAPIClient | ||
| from dstack._internal.core.backends.hotaisle.models import HotAisleConfig | ||
| from dstack._internal.core.models.backends.base import BackendType | ||
| from dstack._internal.core.models.common import CoreModel | ||
| from dstack._internal.core.models.instances import ( | ||
| InstanceAvailability, | ||
| InstanceConfiguration, | ||
|
|
@@ -31,14 +32,28 @@ | |
| MAX_INSTANCE_NAME_LEN = 60 | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (nit) Unused |
||
|
|
||
|
|
||
| class HotaisleCompute( | ||
| INSTANCE_TYPE_SPECS = { | ||
| "1x MI300X 8x Xeon Platinum 8462Y+": { | ||
| "cpu_model": "Xeon Platinum 8462Y+", | ||
| "cpu_frequency": 2800000000, | ||
| "cpu_manufacturer": "Intel", | ||
| }, | ||
| "1x MI300X 13x Xeon Platinum 8470": { | ||
| "cpu_model": "Xeon Platinum 8470", | ||
| "cpu_frequency": 2000000000, | ||
| "cpu_manufacturer": "Intel", | ||
| }, | ||
| } | ||
|
|
||
|
|
||
| class HotAisleCompute( | ||
| ComputeWithCreateInstanceSupport, | ||
| Compute, | ||
| ): | ||
| def __init__(self, config: HotaisleConfig): | ||
| def __init__(self, config: HotAisleConfig): | ||
| super().__init__() | ||
| self.config = config | ||
| self.api_client = HotaisleAPIClient(config.creds.api_key, config.team_handle) | ||
| self.api_client = HotAisleAPIClient(config.creds.api_key, config.team_handle) | ||
| self.catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False) | ||
| self.catalog.add_provider( | ||
| HotAisleProvider(api_key=config.creds.api_key, team_handle=config.team_handle) | ||
|
|
@@ -53,41 +68,43 @@ def get_offers( | |
| requirements=requirements, | ||
| catalog=self.catalog, | ||
| ) | ||
| offers = [ | ||
| InstanceOfferWithAvailability( | ||
| **offer.dict(), availability=InstanceAvailability.AVAILABLE | ||
| ) | ||
| for offer in offers | ||
| ] | ||
| return offers | ||
|
|
||
| supported_offers = [] | ||
| for offer in offers: | ||
| if offer.instance.name in INSTANCE_TYPE_SPECS: | ||
| supported_offers.append( | ||
| InstanceOfferWithAvailability( | ||
| **offer.dict(), availability=InstanceAvailability.AVAILABLE | ||
| ) | ||
| ) | ||
| else: | ||
| logger.warning( | ||
| f"Skipping unsupported Hot Aisle instance type: {offer.instance.name}" | ||
| ) | ||
|
|
||
| return supported_offers | ||
|
|
||
| def get_payload_from_offer(self, instance_type) -> dict: | ||
| # Only two instance types are available in Hotaisle with CPUs: 8-core and 13-core. Other fields are | ||
| # not configurable. | ||
| instance_type_name = instance_type.name | ||
| cpu_specs = INSTANCE_TYPE_SPECS[instance_type_name] | ||
| cpu_cores = instance_type.resources.cpus | ||
| if cpu_cores == 8: | ||
| cpu_model = "Xeon Platinum 8462Y+" | ||
| frequency = 2800000000 | ||
| else: # cpu_cores == 13 | ||
| cpu_model = "Xeon Platinum 8470" | ||
| frequency = 2000000000 | ||
|
|
||
| return { | ||
| "cpu_cores": cpu_cores, | ||
| "cpus": { | ||
| "count": 1, | ||
| "manufacturer": "Intel", | ||
| "model": cpu_model, | ||
| "manufacturer": cpu_specs["cpu_manufacturer"], | ||
| "model": cpu_specs["cpu_model"], | ||
| "cores": cpu_cores, | ||
| "frequency": frequency, | ||
| "frequency": cpu_specs["cpu_frequency"], | ||
| }, | ||
| "disk_capacity": 13194139533312, | ||
| "ram_capacity": 240518168576, | ||
| "disk_capacity": instance_type.resources.disk.size_mib * 1024**2, | ||
| "ram_capacity": instance_type.resources.memory_mib * 1024**2, | ||
| "gpus": [ | ||
| { | ||
| "count": len(instance_type.resources.gpus), | ||
| "manufacturer": "AMD", | ||
| "model": "MI300X", | ||
| "manufacturer": instance_type.resources.gpus[0].vendor, | ||
| "model": instance_type.resources.gpus[0].name, | ||
| } | ||
| ], | ||
| } | ||
|
|
@@ -117,7 +134,9 @@ def create_instance( | |
| ssh_port=22, | ||
| dockerized=True, | ||
| ssh_proxy=None, | ||
| backend_data=vm_data["ip_address"], | ||
| backend_data=HotAisleInstanceBackendData( | ||
| ip_address=vm_data["ip_address"], vm_id=vm_data["name"] | ||
| ).json(), | ||
| ) | ||
|
|
||
| def update_provisioning_data( | ||
|
|
@@ -129,7 +148,8 @@ def update_provisioning_data( | |
| vm_state = self.api_client.get_vm_state(provisioning_data.instance_id) | ||
| if vm_state == "running": | ||
| if provisioning_data.hostname is None and provisioning_data.backend_data: | ||
| provisioning_data.hostname = provisioning_data.backend_data | ||
| backend_data = HotAisleInstanceBackendData.load(provisioning_data.backend_data) | ||
| provisioning_data.hostname = backend_data.ip_address | ||
| commands = get_shim_commands( | ||
| authorized_keys=[project_ssh_public_key], | ||
| arch=provisioning_data.instance_type.resources.cpu_arch, | ||
|
|
@@ -211,3 +231,13 @@ def _run_ssh_command(hostname: str, ssh_private_key: str, command: str): | |
| stdout=subprocess.DEVNULL, | ||
| stderr=subprocess.DEVNULL, | ||
| ) | ||
|
|
||
|
|
||
| class HotAisleInstanceBackendData(CoreModel): | ||
| ip_address: str | ||
| vm_id: Optional[str] = None | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (nit) This field is unused, so I wouldn't include it in the model |
||
|
|
||
| @classmethod | ||
| def load(cls, raw: Optional[str]) -> "HotAisleInstanceBackendData": | ||
| assert raw is not None | ||
| return cls.__response__.parse_raw(raw) | ||
Uh oh!
There was an error while loading. Please reload this page.