-
Notifications
You must be signed in to change notification settings - Fork 13
Expand file tree
/
Copy pathclusters_example.py
More file actions
141 lines (105 loc) · 4.15 KB
/
clusters_example.py
File metadata and controls
141 lines (105 loc) · 4.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""
Example demonstrating how to use the Clusters API.

This example shows how to:
- Create a new compute cluster and wait until it is running
- List all clusters, optionally filtered by status
- Get a specific cluster by ID, including its worker node count
- Delete a cluster
"""
import os
import time
from verda import VerdaClient
from verda.constants import Actions, Locations
# Read the API credentials and endpoint from the environment.
# The two credentials are None when their variables are unset; only the
# base URL has a fallback value.
CLIENT_ID = os.getenv('VERDA_CLIENT_ID')
CLIENT_SECRET = os.getenv('VERDA_CLIENT_SECRET')
BASE_URL = os.getenv('VERDA_BASE_URL', 'https://api.verda.com/v1')

# Module-level client shared by every example function in this file
verda = VerdaClient(CLIENT_ID, CLIENT_SECRET, base_url=BASE_URL)
def create_cluster_example(poll_interval=30, timeout=None):
    """Create a 16B200 compute cluster and block until it is running.

    Args:
        poll_interval: Seconds to sleep between status checks while waiting
            for the cluster. Defaults to 30.
        timeout: Maximum number of seconds to wait for the RUNNING status.
            ``None`` (the default) waits indefinitely.

    Returns:
        The cluster object as last fetched from the API; its status equals
        ``verda.constants.cluster_status.RUNNING``.

    Raises:
        ValueError: If the cluster type is not available in the target
            location, or the required image is not listed for that type.
        TimeoutError: If ``timeout`` is set and the cluster has not reached
            RUNNING status before it elapses.
    """
    # Single source of truth for the values used by the checks and the
    # create call below.
    cluster_type = '16B200'
    image = 'ubuntu-22.04-cuda-12.9-cluster'
    location = Locations.FIN_03

    # Attach every SSH key returned for the account
    ssh_keys = [key.id for key in verda.ssh_keys.get()]

    # Check if cluster type is available
    if not verda.clusters.is_available(cluster_type, location):
        raise ValueError(f'Cluster type {cluster_type} is not available in FIN_03')

    # Get available images for cluster type
    images = verda.clusters.get_cluster_images(cluster_type)
    if image not in images:
        raise ValueError(
            f'Ubuntu 22.04 CUDA 12.9 cluster image is not supported for {cluster_type}'
        )

    # NOTE(review): wait_for_status=None presumably makes create() return
    # without blocking, leaving the waiting to the loop below - confirm
    # against the SDK.
    cluster = verda.clusters.create(
        hostname='my-compute-cluster',
        cluster_type=cluster_type,
        image=image,
        description='Example compute cluster for distributed training',
        ssh_key_ids=ssh_keys,
        location=location,
        shared_volume_name='my-shared-volume',
        shared_volume_size=30000,
        wait_for_status=None,
    )

    print(f'Creating cluster: {cluster.id}')
    print(f'Cluster hostname: {cluster.hostname}')
    print(f'Cluster status: {cluster.status}')
    print(f'Cluster cluster_type: {cluster.cluster_type}')
    print(f'Location: {cluster.location}')

    # Wait for cluster to enter RUNNING status
    running = verda.constants.cluster_status.RUNNING
    # A monotonic clock keeps the deadline immune to wall-clock adjustments
    deadline = None if timeout is None else time.monotonic() + timeout
    while cluster.status != running:
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f'Cluster {cluster.id} did not reach RUNNING status within '
                f'{timeout} seconds (last status: {cluster.status})'
            )
        # Report the freshly fetched status before sleeping, not after
        print(f'Waiting for cluster to enter RUNNING status... (status: {cluster.status})')
        time.sleep(poll_interval)
        cluster = verda.clusters.get_by_id(cluster.id)

    print(f'Public IP: {cluster.ip}')
    print('Cluster is now running and ready to use!')
    return cluster
def list_clusters_example():
    """Print a one-line summary of every cluster, then count the running ones.

    Returns:
        The unfiltered result of ``verda.clusters.get()``.
    """
    all_clusters = verda.clusters.get()
    print(f'\nFound {len(all_clusters)} cluster(s):')
    for item in all_clusters:
        summary = (
            f' - {item.hostname} ({item.id}): {item.status} - {len(item.worker_nodes)} nodes'
        )
        print(summary)

    # Query again, this time restricted to the RUNNING status
    running_status = verda.constants.cluster_status.RUNNING
    running_only = verda.clusters.get(status=running_status)
    print(f'\nFound {len(running_only)} running cluster(s)')

    return all_clusters
def get_cluster_by_id_example(cluster_id: str):
    """Fetch one cluster by its ID and print its details.

    Args:
        cluster_id: Identifier passed to ``verda.clusters.get_by_id``.

    Returns:
        The cluster object returned by the API.
    """
    found = verda.clusters.get_by_id(cluster_id)

    # (label, attribute) pairs, printed in this order
    rows = (
        ('ID', 'id'),
        ('Name', 'hostname'),
        ('Description', 'description'),
        ('Status', 'status'),
        ('Cluster type', 'cluster_type'),
        ('Created at', 'created_at'),
        ('Public IP', 'ip'),
    )

    print('\nCluster details:')
    for label, attr in rows:
        # Attributes are read one row at a time, right before printing
        print(f' {label}: {getattr(found, attr)}')
    print(f' Worker nodes: {len(found.worker_nodes)}')

    return found
def delete_cluster_example(cluster_id: str):
    """Send the DELETE action for a cluster and report progress.

    Args:
        cluster_id: Identifier of the cluster to delete.
    """
    start_message = f'\nDeleting cluster {cluster_id}...'
    print(start_message)

    # Deletion is issued through the generic action endpoint
    delete_action = Actions.DELETE
    verda.clusters.action(cluster_id, delete_action)

    print('Cluster deleted successfully')
def main():
    """Run the cluster examples in order: create, list, inspect, delete.

    The cluster created in step 1 is the one inspected in step 3 and
    deleted in step 4.
    """
    print('=== Clusters API Example ===\n')

    # Create a new cluster
    print('1. Creating a new cluster...')
    cluster = create_cluster_example()
    cluster_id = cluster.id

    # List all clusters
    print('\n2. Listing all clusters...')
    list_clusters_example()

    # Get cluster by ID
    print('\n3. Getting cluster details...')
    get_cluster_by_id_example(cluster_id)

    # Delete the cluster (step number follows on from step 3)
    print('\n4. Deleting the cluster...')
    delete_cluster_example(cluster_id)

    print('\n=== Example completed successfully ===')


# Run the walkthrough only when executed as a script, not on import
if __name__ == '__main__':
    main()