-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathclusters_example.py
More file actions
137 lines (101 loc) · 4.02 KB
/
clusters_example.py
File metadata and controls
137 lines (101 loc) · 4.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
Example demonstrating how to use the Clusters API.
This example shows how to:
- Create a new compute cluster
- List all clusters
- Get a specific cluster by ID
- Report how many worker nodes each cluster has
- Delete a cluster
"""
import os
import time
from verda import VerdaClient
from verda.constants import Actions, ClusterStatus, Locations
# Credentials and endpoint come from the environment. The two credential
# variables have no fallback (they are None when unset); the base URL falls
# back to 'https://api.verda.com/v1'.
CLIENT_ID = os.getenv('VERDA_CLIENT_ID')
CLIENT_SECRET = os.getenv('VERDA_CLIENT_SECRET')
BASE_URL = os.getenv('VERDA_BASE_URL', 'https://api.verda.com/v1')

# Single client instance shared by every example function in this file.
verda = VerdaClient(CLIENT_ID, CLIENT_SECRET, base_url=BASE_URL)
def create_cluster_example(*, poll_interval=30, timeout=None):
    """Create a 16B200 compute cluster and wait until it reports RUNNING.

    The function checks that the cluster type is available in FIN_03 and that
    the expected image is offered for it, submits the create request, prints
    the new cluster's basic attributes, and then polls the cluster by ID until
    its status equals ``ClusterStatus.RUNNING``.

    Args:
        poll_interval: Seconds to sleep between status checks. Defaults to 30.
        timeout: Maximum number of seconds to wait for RUNNING. ``None`` (the
            default) waits indefinitely.

    Returns:
        The most recently fetched cluster object, whose status is RUNNING.

    Raises:
        ValueError: If the cluster type is not available in FIN_03, or the
            expected image is not listed for the cluster type.
        TimeoutError: If ``timeout`` is set and elapses before the cluster
            reports RUNNING. The cluster is NOT deleted in that case; the
            error message carries its ID so it can be cleaned up manually.
    """
    # Named once here so the availability check, image check and create call
    # cannot drift apart.
    cluster_type = '16B200'
    image = 'ubuntu-22.04-cuda-12.9-cluster'
    location = Locations.FIN_03

    # IDs of all SSH keys returned by the API; all of them are attached.
    ssh_keys = [key.id for key in verda.ssh_keys.get()]

    # Fail early with a readable message instead of a rejected create request.
    if not verda.clusters.is_available(cluster_type, location):
        raise ValueError(f'Cluster type {cluster_type} is not available in FIN_03')

    images = verda.clusters.get_cluster_images(cluster_type)
    if image not in images:
        raise ValueError(
            f'Ubuntu 22.04 CUDA 12.9 cluster image is not supported for {cluster_type}'
        )

    cluster = verda.clusters.create(
        hostname='my-compute-cluster',
        cluster_type=cluster_type,
        image=image,
        description='Example compute cluster for distributed training',
        ssh_key_ids=ssh_keys,
        location=location,
        shared_volume_name='my-shared-volume',
        shared_volume_size=30000,
        # NOTE(review): presumably None makes create() return without waiting,
        # so the loop below does the waiting - confirm against the SDK.
        wait_for_status=None,
    )

    print(f'Creating cluster: {cluster.id}')
    print(f'Cluster hostname: {cluster.hostname}')
    print(f'Cluster status: {cluster.status}')
    print(f'Cluster cluster_type: {cluster.cluster_type}')
    print(f'Location: {cluster.location}')

    # Monotonic clock so wall-clock adjustments cannot shorten or extend the wait.
    deadline = None if timeout is None else time.monotonic() + timeout

    while cluster.status != ClusterStatus.RUNNING:
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f'Cluster {cluster.id} did not reach RUNNING within {timeout} seconds '
                f'(last status: {cluster.status})'
            )
        # Report before sleeping so the printed status is the one just fetched.
        print(f'Waiting for cluster to enter RUNNING status... (status: {cluster.status})')
        time.sleep(poll_interval)
        cluster = verda.clusters.get_by_id(cluster.id)

    print(f'Public IP: {cluster.ip}')
    print('Cluster is now running and ready to use!')

    return cluster
def list_clusters_example():
    """Print a one-line summary of every cluster, then count the RUNNING ones.

    Returns:
        The unfiltered result of ``verda.clusters.get()``.
    """
    all_clusters = verda.clusters.get()
    total = len(all_clusters)
    print(f'\nFound {total} cluster(s):')

    for item in all_clusters:
        summary = (
            f' - {item.hostname} ({item.id}): {item.status} - {len(item.worker_nodes)} nodes'
        )
        print(summary)

    # Separate query that passes a status filter to the API.
    running_only = verda.clusters.get(status=ClusterStatus.RUNNING)
    running_total = len(running_only)
    print(f'\nFound {running_total} running cluster(s)')

    return all_clusters
def get_cluster_by_id_example(cluster_id: str):
    """Fetch a single cluster by ID and print its main attributes.

    Args:
        cluster_id: ID passed to ``verda.clusters.get_by_id``.

    Returns:
        The cluster object returned by that call.
    """
    found = verda.clusters.get_by_id(cluster_id)

    # (label, attribute name) pairs, listed in the order they are printed.
    printed_fields = (
        ('ID', 'id'),
        ('Name', 'hostname'),
        ('Description', 'description'),
        ('Status', 'status'),
        ('Cluster type', 'cluster_type'),
        ('Created at', 'created_at'),
        ('Public IP', 'ip'),
    )

    print('\nCluster details:')
    for label, attr_name in printed_fields:
        print(f' {label}: {getattr(found, attr_name)}')

    # Worker nodes are reported as a count rather than printed directly.
    print(f' Worker nodes: {len(found.worker_nodes)}')

    return found
def delete_cluster_example(cluster_id: str):
    """Send the DELETE action for a cluster and print progress messages.

    Args:
        cluster_id: ID of the cluster the action is applied to.
    """
    announcement = f'\nDeleting cluster {cluster_id}...'
    print(announcement)

    requested_action = Actions.DELETE
    verda.clusters.action(cluster_id, requested_action)

    # NOTE(review): printed as soon as the action call returns; whether the
    # cluster is fully gone at that point depends on the SDK - confirm.
    print('Cluster deleted successfully')
def main():
    """Run the cluster examples end to end: create, list, inspect, delete.

    Once the cluster exists, the list and inspect steps run inside
    ``try``/``finally`` so the delete step still runs if one of them raises.
    A failure inside ``create_cluster_example`` itself is not covered, because
    no cluster ID is available to this function at that point.
    """
    print('=== Clusters API Example ===\n')

    print('Creating a new cluster...')
    cluster = create_cluster_example()
    cluster_id = cluster.id

    try:
        print('\nListing all clusters...')
        list_clusters_example()

        print('\nGetting cluster details...')
        get_cluster_by_id_example(cluster_id)
    finally:
        # Always attempt cleanup so a failed step does not leave the cluster up.
        print('\nDeleting the cluster...')
        delete_cluster_example(cluster_id)

    print('\n=== Example completed successfully ===')


# Run the walkthrough only when executed as a script, not when imported.
if __name__ == '__main__':
    main()