Skip to content

Commit 823aa50

Browse files
huksleyclaudeRuslan Gainutdinovshamrinfreetonik
authored
feat: cluster API support for Verda Cloud Python SDK (#70)
* Add Clusters API wrapper Implemented comprehensive support for the Clusters API: - Created ClustersService with methods for cluster management (create, get, delete, scale) - Added Cluster and ClusterNode dataclasses for type safety - Integrated clusters service into VerdaClient - Added ClusterStatus constants for cluster lifecycle management - Created comprehensive unit tests (13 tests covering all API operations) - Added detailed example demonstrating cluster operations - All tests pass (125/125) * feat: clusters api * fix: unit test * fix: polishing * fix: format, lint and unit test fixing * fix: integration tests * fix: review fixes * fix: full features cluster example, add integration test * fix: unit tests * fix: revert to the correct OS images in prod * remove "scale" verb * calling public API every 2 seconds is too much * add TODO comment about backoff logic reuse * keyword-only args, optional description, order to match instances.create * TODO comment in _instances module * import ClusterStatus directly Co-authored-by: Rakhim Davletkaliyev <rakhim@rakhim.org> * use ClusterStatus.RINNING Co-authored-by: Rakhim Davletkaliyev <rakhim@rakhim.org> * Update examples/clusters_example.py Co-authored-by: Rakhim Davletkaliyev <rakhim@rakhim.org> * clean cluster example * use isinstance Co-authored-by: Rakhim Davletkaliyev <rakhim@rakhim.org> * do not send not yet supported 'delete' action for multiple clusters * make ruff happy * remove dummy function * remove unneeded returns * remove unnecessary url variable --------- Co-authored-by: Claude <noreply@anthropic.com> Co-authored-by: Ruslan Gainutdinov <ruslan@datacrunch.io> Co-authored-by: Alexey Shamrin <alexey@datacrunch.io> Co-authored-by: Rakhim Davletkaliyev <rakhim@rakhim.org>
1 parent fc3c4f5 commit 823aa50

File tree

10 files changed

+689
-2
lines changed

10 files changed

+689
-2
lines changed

examples/clusters_example.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
"""
2+
Example demonstrating how to use the Clusters API.
3+
4+
This example shows how to:
5+
- Create a new compute cluster
6+
- List all clusters
7+
- Get a specific cluster by ID
8+
- Get cluster nodes
9+
- Delete a cluster
10+
"""
11+
12+
import os
13+
import time
14+
15+
from verda import VerdaClient
16+
from verda.constants import Actions, ClusterStatus, Locations
17+
18+
# Get credentials from environment variables
19+
CLIENT_ID = os.environ.get('VERDA_CLIENT_ID')
20+
CLIENT_SECRET = os.environ.get('VERDA_CLIENT_SECRET')
21+
BASE_URL = os.environ.get('VERDA_BASE_URL', 'https://api.verda.com/v1')
22+
23+
# Create client
24+
verda = VerdaClient(CLIENT_ID, CLIENT_SECRET, base_url=BASE_URL)
25+
26+
27+
def create_cluster_example():
28+
"""Create a new compute cluster."""
29+
# Get SSH keys
30+
ssh_keys = [key.id for key in verda.ssh_keys.get()]
31+
32+
# Check if cluster type is available
33+
if not verda.clusters.is_available('16B200', Locations.FIN_03):
34+
raise ValueError('Cluster type 16B200 is not available in FIN_03')
35+
36+
# Get available images for cluster type
37+
images = verda.clusters.get_cluster_images('16B200')
38+
if 'ubuntu-22.04-cuda-12.9-cluster' not in images:
39+
raise ValueError('Ubuntu 22.04 CUDA 12.9 cluster image is not supported for 16B200')
40+
41+
# Create a 16B200 cluster
42+
cluster = verda.clusters.create(
43+
hostname='my-compute-cluster',
44+
cluster_type='16B200',
45+
image='ubuntu-22.04-cuda-12.9-cluster',
46+
description='Example compute cluster for distributed training',
47+
ssh_key_ids=ssh_keys,
48+
location=Locations.FIN_03,
49+
shared_volume_name='my-shared-volume',
50+
shared_volume_size=30000,
51+
wait_for_status=None,
52+
)
53+
54+
print(f'Creating cluster: {cluster.id}')
55+
print(f'Cluster hostname: {cluster.hostname}')
56+
print(f'Cluster status: {cluster.status}')
57+
print(f'Cluster cluster_type: {cluster.cluster_type}')
58+
print(f'Location: {cluster.location}')
59+
60+
# Wait for cluster to enter RUNNING status
61+
while cluster.status != ClusterStatus.RUNNING:
62+
time.sleep(30)
63+
print(f'Waiting for cluster to enter RUNNING status... (status: {cluster.status})')
64+
cluster = verda.clusters.get_by_id(cluster.id)
65+
66+
print(f'Public IP: {cluster.ip}')
67+
print('Cluster is now running and ready to use!')
68+
69+
return cluster
70+
71+
72+
def list_clusters_example():
73+
"""List all clusters."""
74+
# Get all clusters
75+
clusters = verda.clusters.get()
76+
77+
print(f'\nFound {len(clusters)} cluster(s):')
78+
for cluster in clusters:
79+
print(
80+
f' - {cluster.hostname} ({cluster.id}): {cluster.status} - {len(cluster.worker_nodes)} nodes'
81+
)
82+
83+
# Get clusters with specific status
84+
running_clusters = verda.clusters.get(status=ClusterStatus.RUNNING)
85+
print(f'\nFound {len(running_clusters)} running cluster(s)')
86+
87+
return clusters
88+
89+
90+
def get_cluster_by_id_example(cluster_id: str):
91+
"""Get a specific cluster by ID."""
92+
cluster = verda.clusters.get_by_id(cluster_id)
93+
94+
print('\nCluster details:')
95+
print(f' ID: {cluster.id}')
96+
print(f' Name: {cluster.hostname}')
97+
print(f' Description: {cluster.description}')
98+
print(f' Status: {cluster.status}')
99+
print(f' Cluster type: {cluster.cluster_type}')
100+
print(f' Created at: {cluster.created_at}')
101+
print(f' Public IP: {cluster.ip}')
102+
print(f' Worker nodes: {len(cluster.worker_nodes)}')
103+
104+
return cluster
105+
106+
107+
def delete_cluster_example(cluster_id: str):
108+
"""Delete a cluster."""
109+
print(f'\nDeleting cluster {cluster_id}...')
110+
111+
verda.clusters.action(cluster_id, Actions.DELETE)
112+
113+
print('Cluster deleted successfully')
114+
115+
116+
def main():
117+
"""Run all cluster examples."""
118+
print('=== Clusters API Example ===\n')
119+
120+
print('Creating a new cluster...')
121+
cluster = create_cluster_example()
122+
cluster_id = cluster.id
123+
124+
print('\nListing all clusters...')
125+
list_clusters_example()
126+
127+
print('\nGetting cluster details...')
128+
get_cluster_by_id_example(cluster_id)
129+
130+
print('\nDeleting the cluster...')
131+
delete_cluster_example(cluster_id)
132+
133+
print('\n=== Example completed successfully ===')
134+
135+
136+
if __name__ == '__main__':
137+
main()

tests/integration_tests/conftest.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,11 @@
99
Make sure to run the server and the account has enough balance before running the tests
1010
"""
1111

12-
BASE_URL = 'http://localhost:3010/v1'
13-
1412
# Load env variables, make sure there's an env file with valid client credentials
1513
load_dotenv()
1614
CLIENT_SECRET = os.getenv('VERDA_CLIENT_SECRET')
1715
CLIENT_ID = os.getenv('VERDA_CLIENT_ID')
16+
BASE_URL = os.getenv('VERDA_BASE_URL', 'http://localhost:3010/v1')
1817

1918

2019
@pytest.fixture
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import logging
2+
import os
3+
4+
import pytest
5+
6+
from verda import VerdaClient
7+
from verda.constants import Locations
8+
9+
logging.basicConfig(level=logging.DEBUG)
10+
logger = logging.getLogger()
11+
12+
13+
IN_GITHUB_ACTIONS = os.getenv('GITHUB_ACTIONS') == 'true'
14+
15+
16+
@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="Test doesn't work in Github Actions.")
17+
@pytest.mark.withoutresponses
18+
class TestClusters:
19+
def test_create_cluster(self, verda_client: VerdaClient):
20+
# get ssh key
21+
ssh_key = verda_client.ssh_keys.get()[0]
22+
23+
if not verda_client.clusters.is_available('16B200', Locations.FIN_03):
24+
raise ValueError('Cluster type 16B200 is not available in FIN_03')
25+
logger.debug('[x] Cluster type 16B200 is available in FIN_03')
26+
27+
availabilities = verda_client.clusters.get_availabilities(Locations.FIN_03)
28+
assert len(availabilities) > 0
29+
assert '16B200' in availabilities
30+
logger.debug(
31+
'[x] Cluster type 16B200 is one of the available cluster types in FIN_03: %s',
32+
availabilities,
33+
)
34+
35+
images = verda_client.clusters.get_cluster_images('16B200')
36+
assert len(images) > 0
37+
assert 'ubuntu-22.04-cuda-12.9-cluster' in images
38+
logger.debug('[x] Ubuntu 22.04 CUDA 12.9 cluster image is supported for 16B200')
39+
40+
# create instance
41+
cluster = verda_client.clusters.create(
42+
hostname='test-instance',
43+
location=Locations.FIN_03,
44+
cluster_type='16B200',
45+
description='test instance',
46+
image='ubuntu-22.04-cuda-12.9-cluster',
47+
ssh_key_ids=[ssh_key.id],
48+
# Set to None to not wait for provisioning but return immediately
49+
wait_for_status=verda_client.constants.cluster_status.PROVISIONING,
50+
)
51+
52+
# assert instance is created
53+
assert cluster.id is not None
54+
assert (
55+
cluster.status == verda_client.constants.cluster_status.PROVISIONING
56+
or cluster.status == verda_client.constants.cluster_status.RUNNING
57+
)
58+
59+
# If still provisioning, we don't have worker nodes yet and ip is not available
60+
if cluster.status != verda_client.constants.instance_status.PROVISIONING:
61+
assert cluster.worker_nodes is not None
62+
assert len(cluster.worker_nodes) == 2
63+
assert cluster.ip is not None
64+
65+
# Now we need to wait for RUNNING status to connect to the jumphost (public IP is available)
66+
# After that, we can connect to the jumphost and run commands on the cluster nodes:
67+
#
68+
# ssh -i ssh_key.pem root@<public_ip>
69+
#

tests/unit_tests/clusters/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)