Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions examples/clusters_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
"""
Example demonstrating how to use the Clusters API.

This example shows how to:
- Create a new compute cluster
- List all clusters
- Get a specific cluster by ID
- Get cluster nodes
- Delete a cluster
"""

import os
import time

from verda import VerdaClient
from verda.constants import Actions, Locations

# Get credentials from environment variables
# NOTE: these are None if the variables are unset; VerdaClient will then fail to authenticate
CLIENT_ID = os.environ.get('VERDA_CLIENT_ID')
CLIENT_SECRET = os.environ.get('VERDA_CLIENT_SECRET')
# API endpoint; override VERDA_BASE_URL to point at a staging/local server
BASE_URL = os.environ.get('VERDA_BASE_URL', 'https://api.verda.com/v1')

# Create client
# Module-level singleton shared by every example function below
verda = VerdaClient(CLIENT_ID, CLIENT_SECRET, base_url=BASE_URL)


def create_cluster_example():
    """Create a 16B200 compute cluster and block until it is RUNNING.

    Returns:
        The cluster object, refreshed after it reaches RUNNING status.

    Raises:
        ValueError: if the cluster type or image is unavailable in FIN_03.
    """
    # Attach every SSH key registered on the account
    key_ids = [ssh_key.id for ssh_key in verda.ssh_keys.get()]

    # Ensure the desired cluster type can be provisioned in this location
    if not verda.clusters.is_available('16B200', Locations.FIN_03):
        raise ValueError('Cluster type 16B200 is not available in FIN_03')

    # Ensure the desired OS image is supported for this cluster type
    supported_images = verda.clusters.get_cluster_images('16B200')
    if 'ubuntu-22.04-cuda-12.9-cluster' not in supported_images:
        raise ValueError('Ubuntu 22.04 CUDA 12.9 cluster image is not supported for 16B200')

    # Request the cluster; wait_for_status=None makes create() return immediately
    cluster = verda.clusters.create(
        hostname='my-compute-cluster',
        cluster_type='16B200',
        image='ubuntu-22.04-cuda-12.9-cluster',
        description='Example compute cluster for distributed training',
        ssh_key_ids=key_ids,
        location=Locations.FIN_03,
        shared_volume_name='my-shared-volume',
        shared_volume_size=30000,
        wait_for_status=None,
    )

    print(f'Creating cluster: {cluster.id}')
    print(f'Cluster hostname: {cluster.hostname}')
    print(f'Cluster status: {cluster.status}')
    print(f'Cluster cluster_type: {cluster.cluster_type}')
    print(f'Location: {cluster.location}')

    # Poll every 2 seconds until the cluster reports RUNNING
    running = verda.constants.cluster_status.RUNNING
    while cluster.status != running:
        time.sleep(2)
        print(f'Waiting for cluster to enter RUNNING status... (status: {cluster.status})')
        cluster = verda.clusters.get_by_id(cluster.id)

    print(f'Public IP: {cluster.ip}')
    print('Cluster is now running and ready to use!')

    return cluster


def list_clusters_example():
    """Print every cluster on the account, then just the running ones.

    Returns:
        The full list of clusters.
    """
    all_clusters = verda.clusters.get()

    print(f'\nFound {len(all_clusters)} cluster(s):')
    for c in all_clusters:
        print(
            f' - {c.hostname} ({c.id}): {c.status} - {len(c.worker_nodes)} nodes'
        )

    # The API also supports server-side filtering by status
    running = verda.clusters.get(status=verda.constants.cluster_status.RUNNING)
    print(f'\nFound {len(running)} running cluster(s)')

    return all_clusters


def get_cluster_by_id_example(cluster_id: str):
    """Fetch a single cluster by its ID and print its details.

    Args:
        cluster_id: ID of the cluster to look up.

    Returns:
        The fetched cluster object.
    """
    cluster = verda.clusters.get_by_id(cluster_id)

    # Print one labelled line per field, in a fixed order
    print('\nCluster details:')
    details = (
        ('ID', cluster.id),
        ('Name', cluster.hostname),
        ('Description', cluster.description),
        ('Status', cluster.status),
        ('Cluster type', cluster.cluster_type),
        ('Created at', cluster.created_at),
        ('Public IP', cluster.ip),
        ('Worker nodes', len(cluster.worker_nodes)),
    )
    for label, value in details:
        print(f' {label}: {value}')

    return cluster


def delete_cluster_example(cluster_id: str):
    """Delete a cluster.

    Args:
        cluster_id: ID of the cluster to delete.
    """
    # Fix: stray review-UI text was embedded in the middle of this function,
    # breaking the file; the function body below is the intended code.
    print(f'\nDeleting cluster {cluster_id}...')

    # NOTE(review): a dedicated verda.clusters.delete(cluster_id) helper may be
    # nicer than the generic action call if the SDK provides one — TODO confirm.
    verda.clusters.action(cluster_id, Actions.DELETE)

    print('Cluster deleted successfully')


def main():
    """Run all cluster examples end to end: create, list, inspect, delete."""
    print('=== Clusters API Example ===\n')

    # Create a new cluster
    print('1. Creating a new cluster...')
    cluster = create_cluster_example()
    cluster_id = cluster.id

    # List all clusters
    print('\n2. Listing all clusters...')
    list_clusters_example()

    # Get cluster by ID
    print('\n3. Getting cluster details...')
    get_cluster_by_id_example(cluster_id)

    # Delete the cluster
    # Fix: step was mislabeled '6.' — it is the fourth and final step
    print('\n4. Deleting the cluster...')
    delete_cluster_example(cluster_id)

    print('\n=== Example completed successfully ===')


if __name__ == '__main__':
    main()
3 changes: 1 addition & 2 deletions tests/integration_tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,11 @@
Make sure to run the server and the account has enough balance before running the tests
"""

# Load env variables, make sure there's an env file with valid client credentials
load_dotenv()
CLIENT_SECRET = os.getenv('VERDA_CLIENT_SECRET')
CLIENT_ID = os.getenv('VERDA_CLIENT_ID')
# Fix: removed the stale hard-coded BASE_URL assignment that shadowed this one.
# The endpoint is overridable via VERDA_BASE_URL; default is the local dev server.
BASE_URL = os.getenv('VERDA_BASE_URL', 'http://localhost:3010/v1')


@pytest.fixture
Expand Down
69 changes: 69 additions & 0 deletions tests/integration_tests/test_clusters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import logging
import os

import pytest

from verda import VerdaClient
from verda.constants import Locations

# Emit debug-level output so each integration step below is traceable
logging.basicConfig(level=logging.DEBUG)
# Root logger shared by the tests in this module
logger = logging.getLogger()


# True when running under GitHub Actions (the env var is the literal string 'true')
IN_GITHUB_ACTIONS = os.getenv('GITHUB_ACTIONS') == 'true'


@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="Test doesn't work in Github Actions.")
@pytest.mark.withoutresponses
class TestClusters:
    """Integration tests for the Clusters API.

    Requires a live server and an account with sufficient balance.
    """

    def test_create_cluster(self, verda_client: VerdaClient):
        """Create a 16B200 cluster and sanity-check availability, images, and status."""
        # get ssh key
        ssh_key = verda_client.ssh_keys.get()[0]

        if not verda_client.clusters.is_available('16B200', Locations.FIN_03):
            raise ValueError('Cluster type 16B200 is not available in FIN_03')
        logger.debug('[x] Cluster type 16B200 is available in FIN_03')

        availabilities = verda_client.clusters.get_availabilities(Locations.FIN_03)
        assert len(availabilities) > 0
        assert '16B200' in availabilities
        logger.debug(
            '[x] Cluster type 16B200 is one of the available cluster types in FIN_03: %s',
            availabilities,
        )

        images = verda_client.clusters.get_cluster_images('16B200')
        assert len(images) > 0
        assert 'ubuntu-22.04-cuda-12.9-cluster' in images
        logger.debug('[x] Ubuntu 22.04 CUDA 12.9 cluster image is supported for 16B200')

        # create instance
        cluster = verda_client.clusters.create(
            hostname='test-instance',
            location=Locations.FIN_03,
            cluster_type='16B200',
            description='test instance',
            image='ubuntu-22.04-cuda-12.9-cluster',
            ssh_key_ids=[ssh_key.id],
            # Wait only until PROVISIONING so the test returns quickly
            # (comment previously claimed None was passed, which was wrong)
            wait_for_status=verda_client.constants.cluster_status.PROVISIONING,
        )

        # assert instance is created
        assert cluster.id is not None
        assert (
            cluster.status == verda_client.constants.cluster_status.PROVISIONING
            or cluster.status == verda_client.constants.cluster_status.RUNNING
        )

        # If still provisioning, we don't have worker nodes yet and ip is not available
        # Fix: this compared against instance_status.PROVISIONING (copy-paste from the
        # instances test); cluster statuses must come from cluster_status.
        if cluster.status != verda_client.constants.cluster_status.PROVISIONING:
            assert cluster.worker_nodes is not None
            assert len(cluster.worker_nodes) == 2
            assert cluster.ip is not None

        # Now we need to wait for RUNNING status to connect to the jumphost (public IP is available)
        # After that, we can connect to the jumphost and run commands on the cluster nodes:
        #
        # ssh -i ssh_key.pem root@<public_ip>
        #
Empty file.
Loading