Skip to content

Commit f009010

Browse files
author
Ruslan Gainutdinov
committed
fix: review fixes
1 parent 471e089 commit f009010

3 files changed

Lines changed: 104 additions & 24 deletions

File tree

examples/clusters_example.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,23 +28,31 @@ def create_cluster_example():
2828
# Get SSH keys
2929
ssh_keys = [key.id for key in verda.ssh_keys.get()]
3030

31-
# Create a cluster with 3 nodes
31+
# Check if cluster type is available
32+
if not verda.clusters.is_available('16B200', Locations.FIN_03):
33+
raise ValueError('Cluster type 16B200 is not available in FIN_03')
34+
35+
# Get available images for cluster type
36+
images = verda.clusters.get_cluster_images('16B200')
37+
if 'ubuntu-22.04-cuda-12.9-cluster' not in images:
38+
raise ValueError('Ubuntu 22.04 CUDA 12.9 cluster image is not supported for 16B200')
39+
40+
# Create a 16B200 cluster
3241
cluster = verda.clusters.create(
3342
hostname='my-compute-cluster',
34-
cluster_type='16H200',
35-
image='ubuntu-22.04-cuda-12.4-cluster',
43+
cluster_type='16B200',
44+
image='ubuntu-22.04-cuda-12.9-cluster',
3645
description='Example compute cluster for distributed training',
3746
ssh_key_ids=ssh_keys,
3847
location=Locations.FIN_03,
3948
shared_volume_name='my-shared-volume',
4049
shared_volume_size=30000,
4150
)
4251

43-
print(f'Created cluster: {cluster.id}')
52+
print(f'Creating cluster: {cluster.id}')
4453
print(f'Cluster hostname: {cluster.hostname}')
4554
print(f'Cluster status: {cluster.status}')
4655
print(f'Cluster cluster_type: {cluster.cluster_type}')
47-
print(f'Cluster worker_nodes: {cluster.worker_nodes}')
4856
print(f'Location: {cluster.location}')
4957

5058
return cluster
Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
1+
import logging
12
import os
23

34
import pytest
45

56
from verda import VerdaClient
67
from verda.constants import Locations
78

9+
logging.basicConfig(level=logging.DEBUG)
10+
logger = logging.getLogger()
11+
12+
813
IN_GITHUB_ACTIONS = os.getenv('GITHUB_ACTIONS') == 'true'
914

1015

@@ -15,21 +20,40 @@ def test_create_cluster(self, verda_client: VerdaClient):
1520
# get ssh key
1621
ssh_key = verda_client.ssh_keys.get()[0]
1722

23+
if not verda_client.clusters.is_available('16B200', Locations.FIN_03):
24+
raise ValueError('Cluster type 16B200 is not available in FIN_03')
25+
logger.debug('[x] Cluster type 16B200 is available in FIN_03')
26+
27+
availabilities = verda_client.clusters.get_availabilities(Locations.FIN_03)
28+
assert len(availabilities) > 0
29+
assert '16B200' in availabilities
30+
logger.debug(
31+
'[x] Cluster type 16B200 is one of the available cluster types in FIN_03: %s',
32+
availabilities,
33+
)
34+
35+
images = verda_client.clusters.get_cluster_images('16B200')
36+
assert len(images) > 0
37+
assert 'ubuntu-22.04-cuda-12.9-cluster' in images
38+
logger.debug('[x] Ubuntu 22.04 CUDA 12.9 cluster image is supported for 16B200')
39+
1840
# create instance
1941
cluster = verda_client.clusters.create(
2042
hostname='test-instance',
2143
location=Locations.FIN_03,
2244
cluster_type='16B200',
2345
description='test instance',
24-
image='ubuntu-22.04-cuda-12.8-cluster',
46+
image='ubuntu-22.04-cuda-12.9-cluster',
2547
ssh_key_ids=[ssh_key.id],
48+
# Wait until the cluster reaches PROVISIONING; pass None to return immediately without waiting
49+
wait_for_status=verda_client.constants.cluster_status.PROVISIONING,
2650
)
2751

2852
# assert instance is created
2953
assert cluster.id is not None
3054
assert (
31-
cluster.status == verda_client.constants.instance_status.PROVISIONING
32-
or cluster.status == verda_client.constants.instance_status.RUNNING
55+
cluster.status == verda_client.constants.cluster_status.PROVISIONING
56+
or cluster.status == verda_client.constants.cluster_status.RUNNING
3357
)
3458

3559
# If still provisioning, we don't have worker nodes yet and ip is not available
@@ -38,7 +62,11 @@ def test_create_cluster(self, verda_client: VerdaClient):
3862
assert len(cluster.worker_nodes) == 2
3963
assert cluster.ip is not None
4064

41-
print(cluster)
65+
print(f'Creating cluster: {cluster.id}')
66+
print(f'Cluster hostname: {cluster.hostname}')
67+
print(f'Cluster status: {cluster.status}')
68+
print(f'Cluster cluster_type: {cluster.cluster_type}')
69+
print(f'Location: {cluster.location}')
4270

4371
# delete instance
4472
# verda_client.clusters.action(cluster.id, 'delete')

verda/clusters/_clusters.py

Lines changed: 59 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,25 @@
11
import itertools
22
import time
33
from dataclasses import dataclass
4-
from typing import Literal
54

65
from dataclasses_json import dataclass_json
76

8-
from verda.constants import Actions, Locations
7+
from verda.constants import Actions, ClusterStatus, ErrorCodes, Locations
8+
from verda.exceptions import APIException
99

1010
CLUSTERS_ENDPOINT = '/clusters'
1111

12+
# Default shared volume size is 30TB
13+
DEFAULT_SHARED_VOLUME_SIZE = 30000
14+
1215

1316
@dataclass_json
1417
@dataclass
1518
class ClusterWorkerNode:
1619
"""Represents a worker node in a cluster.
1720
1821
Attributes:
19-
id: Unique identifier for the node (instance ID).
22+
id: Unique identifier for the node.
2023
status: Current status of the node.
2124
hostname: Network hostname of the node.
2225
private_ip: Private IP address of the node.
@@ -40,7 +43,7 @@ class Cluster:
4043
status: Current operational status of the cluster.
4144
created_at: Timestamp of cluster creation.
4245
location: Datacenter location code (default: Locations.FIN_03).
43-
cluster_type: Type of instances used for cluster nodes.
46+
cluster_type: Type of the cluster.
4447
worker_nodes: List of nodes in the cluster.
4548
ssh_key_ids: List of SSH key IDs associated with the cluster nodes.
4649
image: Image ID or type used for cluster nodes.
@@ -65,7 +68,7 @@ class Cluster:
6568
class ClustersService:
6669
"""Service for managing compute clusters through the API.
6770
68-
This service provides methods to create, retrieve, scale, and manage compute clusters.
71+
This service provides methods to create, retrieve, and manage compute clusters.
6972
"""
7073

7174
def __init__(self, http_client) -> None:
@@ -118,6 +121,7 @@ def create(
118121
shared_volume_name: str | None = None,
119122
shared_volume_size: int | None = None,
120123
*,
124+
wait_for_status: str | None = ClusterStatus.PROVISIONING,
121125
max_wait_time: float = 900,
122126
initial_interval: float = 1.0,
123127
max_interval: float = 10,
@@ -135,6 +139,7 @@ def create(
135139
startup_script_id: Optional ID of startup script to run on nodes.
136140
shared_volume_name: Optional name for the shared volume.
137141
shared_volume_size: Optional size for the shared volume, in GB; defaults to 30 TB (30000 GB).
142+
wait_for_status: Status to wait for the cluster to reach; defaults to PROVISIONING. If None, no wait is performed.
138143
max_wait_time: Maximum total wait for the cluster to start creating, in seconds (default: 900)
139144
initial_interval: Initial interval, in seconds (default: 1.0)
140145
max_interval: The longest single delay allowed between retries, in seconds (default: 10)
@@ -158,19 +163,28 @@ def create(
158163
'startup_script_id': startup_script_id,
159164
'shared_volume': {
160165
'name': shared_volume_name if shared_volume_name else hostname + '-shared-volume',
161-
'size': shared_volume_size if shared_volume_size else 30000,
166+
'size': shared_volume_size if shared_volume_size else DEFAULT_SHARED_VOLUME_SIZE,
162167
},
163168
}
164169
response = self._http_client.post(CLUSTERS_ENDPOINT, json=payload).json()
165170
id = response['id']
166171

172+
if not wait_for_status:
173+
return self.get_by_id(id)
174+
167175
# Wait for cluster to enter creating state with timeout
168176
deadline = time.monotonic() + max_wait_time
169177
for i in itertools.count():
170178
cluster = self.get_by_id(id)
171-
if cluster.status != 'ordered':
179+
if cluster.status == wait_for_status:
172180
return cluster
173181

182+
if cluster.status == ClusterStatus.ERROR:
183+
raise APIException(ErrorCodes.SERVER_ERROR, f'Cluster {id} entered error state')
184+
185+
if cluster.status == ClusterStatus.DISCONTINUED:
186+
raise APIException(ErrorCodes.SERVER_ERROR, f'Cluster {id} was discontinued')
187+
174188
now = time.monotonic()
175189
if now >= deadline:
176190
raise TimeoutError(
@@ -181,10 +195,10 @@ def create(
181195
time.sleep(interval)
182196

183197
def action(self, id_list: list[str] | str, action: str) -> None:
184-
"""Performs an action on one or more instances.
198+
"""Performs an action on one or more clusters.
185199
186200
Args:
187-
id_list: Single instance ID or list of instance IDs to act upon.
201+
id_list: Single cluster ID or list of cluster IDs to act upon.
188202
action: Action to perform on the clusters. Only `delete` is supported.
189203
190204
Raises:
@@ -215,20 +229,21 @@ def is_available(
215229
cluster_type: str,
216230
location_code: str | None = None,
217231
) -> bool:
218-
"""Checks if a specific instance type is available for deployment.
232+
"""Checks if a specific cluster type is available for deployment.
219233
220234
Args:
221-
cluster_type: Type of instance to check availability for.
235+
cluster_type: Type of cluster to check availability for.
222236
location_code: Optional datacenter location code.
223237
224238
Returns:
225-
True if the instance type is available, False otherwise.
239+
True if the cluster type is available, False otherwise.
226240
"""
227241
query_params = {'location_code': location_code}
228242
url = f'/cluster-availability/{cluster_type}'
229-
return self._http_client.get(url, query_params).json()
243+
response = self._http_client.get(url, query_params).text
244+
return response == 'true'
230245

231-
def get_availabilities(self, location_code: str | None = None) -> list[dict]:
246+
def get_availabilities(self, location_code: str | None = None) -> list[str]:
232247
"""Retrieves a list of available cluster types across locations.
233248
234249
Args:
@@ -238,4 +253,33 @@ def get_availabilities(self, location_code: str | None = None) -> list[dict]:
238253
List of available cluster types and their details.
239254
"""
240255
query_params = {'location_code': location_code}
241-
return self._http_client.get('/cluster-availability', params=query_params).json()
256+
response = self._http_client.get('/cluster-availability', params=query_params).json()
257+
availabilities = response[0]['availabilities']
258+
return availabilities
259+
260+
def get_availability(self, cluster_type: str, location_code: str | None = None) -> list[dict]:
261+
"""Checks if a specific cluster type is available for deployment.
262+
263+
Args:
264+
cluster_type: Type of cluster to check availability for.
265+
location_code: Optional datacenter location code.
266+
267+
Returns:
268+
True if the cluster type is available, False otherwise.
269+
"""
270+
271+
def get_cluster_images(
272+
self,
273+
cluster_type: str | None = None,
274+
) -> list[str]:
275+
"""Retrieves a list of available images for a given cluster type (optional).
276+
277+
Args:
278+
cluster_type: Type of cluster to get images for.
279+
280+
Returns:
281+
List of available images for the given cluster type.
282+
"""
283+
query_params = {'instance_type': cluster_type}
284+
images = self._http_client.get('/images/cluster', params=query_params).json()
285+
return [image['image_type'] for image in images]

0 commit comments

Comments
 (0)