From 4a58718dbbb6e46926dd9e284e86515cc11a5621 Mon Sep 17 00:00:00 2001 From: Dominik Rabij Date: Thu, 5 Feb 2026 19:15:57 +0100 Subject: [PATCH 01/27] Implement better reservation handling logic and capacity calculations --- recipes/Cluster_create_RayCluster.md | 2 + recipes/Cluster_create_private.md | 2 + recipes/Cluster_create_sub-slicing.md | 2 + recipes/Cluster_create_super-slicing.md | 20 +- recipes/Cluster_create_with_gb200-4.md | 2 + .../Cluster_create_with_shared_reservation.md | 4 +- src/xpk/core/capacity.py | 280 ++++++++++++++++ src/xpk/core/capacity_test.py | 313 ++++++++++++++++-- src/xpk/core/nodepool.py | 66 ++-- src/xpk/core/nodepool_test.py | 221 +++++++++++-- 10 files changed, 829 insertions(+), 83 deletions(-) diff --git a/recipes/Cluster_create_RayCluster.md b/recipes/Cluster_create_RayCluster.md index df227ac0c..230c16943 100644 --- a/recipes/Cluster_create_RayCluster.md +++ b/recipes/Cluster_create_RayCluster.md @@ -59,6 +59,8 @@ gcloud beta container node-pools describe 0 --cluster golden-cluster --project=g [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true [XPK] Existing node pool names ['0'] +[XPK] Task: `Get reservation count for golden-reservation` is implemented by the following command not running since it is a dry run. +gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="csv[no-heading](specificReservation.count,specificReservation.inUseCount,status)" [XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" [XPK] Breaking up a total of 1 commands into 1 batches [XPK] Pretending all the jobs succeeded diff --git a/recipes/Cluster_create_private.md b/recipes/Cluster_create_private.md index 3c67d9539..428ca9aff 100644 --- a/recipes/Cluster_create_private.md +++ b/recipes/Cluster_create_private.md @@ -63,6 +63,8 @@ gcloud beta container node-pools describe 0 --cluster golden-cluster-private --p [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. kubectl get configmap golden-cluster-private-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true [XPK] Existing node pool names ['0'] +[XPK] Task: `Get reservation count for golden-reservation` is implemented by the following command not running since it is a dry run. +gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="csv[no-heading](specificReservation.count,specificReservation.inUseCount,status)" [XPK] To complete NodepoolCreate-golden-cluster-private-np-0 we are executing gcloud beta container node-pools create golden-cluster-private-np-0 --location=us-central1 --cluster=golden-cluster-private --project=golden-project --node-locations=us-central1-a --machine-type=ct5p-hightpu-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" [XPK] To complete NodepoolCreate-cpu-np we are executing gcloud beta container node-pools create cpu-np --node-version=0 --cluster=golden-cluster-private --project=golden-project --node-locations=us-central1-a --location=us-central1 --num-nodes=1 --machine-type=n2-standard-64 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --enable-autoscaling --min-nodes=1 --max-nodes=20 [XPK] Breaking up a total of 2 commands into 1 batches diff --git a/recipes/Cluster_create_sub-slicing.md b/recipes/Cluster_create_sub-slicing.md index b03f037e1..b374f4162 100644 --- a/recipes/Cluster_create_sub-slicing.md +++ b/recipes/Cluster_create_sub-slicing.md @@ -61,6 +61,8 @@ gcloud beta container node-pools describe 0 --cluster golden-cluster --project=g [XPK] Task: `GKE Cluster Get ConfigMap` is implemented by the following command not running since it is a dry run. kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true [XPK] Existing node pool names ['0'] +[XPK] Task: `Get reservation count for golden-reservation` is implemented by the following command not running since it is a dry run. +gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="csv[no-heading](specificReservation.count,specificReservation.inUseCount,status)" [XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=ct6e-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --enable-gvnic --accelerator-network-profile=auto --node-labels=cloud.google.com/gke-networking-dra-driver=true --node-version=0 --num-nodes=4 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --tpu-topology=4x4 --max-pods-per-node 15 [XPK] Breaking up a total of 1 commands into 1 batches [XPK] Pretending all the jobs succeeded diff --git a/recipes/Cluster_create_super-slicing.md b/recipes/Cluster_create_super-slicing.md index 03ca858bd..2374a61c0 100644 --- a/recipes/Cluster_create_super-slicing.md +++ b/recipes/Cluster_create_super-slicing.md @@ -3,10 +3,10 @@ Creates a GKE cluster with TPU super-slicing enabled for multi-slice training. # Running the command ```shell #golden -SUPER_SLICING_ENABLED=true xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-4x4x4 --reservation=golden-reservation/reservationBlocks/block/reservationSubBlocks/subblock --super-slicing --num-cubes=5 +SUPER_SLICING_ENABLED=true DRY_RUN_RESERVATION_SUB_BLOCKS="golden-reservation/reservationBlocks/block=sub0,sub1,sub2,sub3,sub4" xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-4x4x4 --reservation=golden-reservation/reservationBlocks/block --super-slicing --num-cubes=5 ``` diff --git a/recipes/Cluster_create_with_gb200-4.md b/recipes/Cluster_create_with_gb200-4.md index 300bd4fa4..3c844744f 100644 --- a/recipes/Cluster_create_with_gb200-4.md +++ b/recipes/Cluster_create_with_gb200-4.md @@ -62,7 +62,7 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf [XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run. gcloud beta compute resource-policies describe gb200-4-1x72-placement-policy --project=golden-project --region=us-central1 [XPK] Task: `Get reservation count for golden-reservation` is implemented by the following command not running since it is a dry run. -gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="csv(specificReservation.count:label=count,specificReservation.inUseCount:label=inUseCount,status)" +gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="csv(specificReservation.count:label=count,specificReservation.inUseCount:label=in_use_count,status)" [XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=a4x-highgpu-4g --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --placement-policy=gb200-4-1x72-placement-policy --enable-gvnic --accelerator-network-profile=auto --node-labels=cloud.google.com/gke-networking-dra-driver=true --num-nodes=2 --accelerator type=nvidia-gb200,count=4,gpu-driver-version=latest --scopes="https://www.googleapis.com/auth/cloud-platform" [XPK] Breaking up a total of 1 commands into 1 batches [XPK] Pretending all the jobs succeeded diff --git a/recipes/Cluster_create_with_shared_reservation.md b/recipes/Cluster_create_with_shared_reservation.md index b0462e18b..d521d367c 100644 --- a/recipes/Cluster_create_with_shared_reservation.md +++ b/recipes/Cluster_create_with_shared_reservation.md @@ -60,7 +60,7 @@ gcloud beta container node-pools describe 0 --cluster golden-cluster --project=g kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true [XPK] Existing node pool names ['0'] [XPK] Task: `Get reservation count for golden-reservation` is implemented by the following command not running since it is a dry run. -gcloud beta compute reservations describe golden-reservation --project=reservation-project --zone=us-central1-a --format="csv(specificReservation.count:label=count,specificReservation.inUseCount:label=inUseCount,status)" +gcloud beta compute reservations describe golden-reservation --project=reservation-project --zone=us-central1-a --format="csv(specificReservation.count:label=count,specificReservation.inUseCount:label=in_use_count,status)" [XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=projects/reservation-project/reservations/golden-reservation --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" [XPK] Breaking up a total of 1 commands into 1 batches [XPK] Pretending all the jobs succeeded diff --git a/src/xpk/core/capacity.py b/src/xpk/core/capacity.py index d4b552f7c..1b0c9d959 100644 --- a/src/xpk/core/capacity.py +++ b/src/xpk/core/capacity.py @@ -495,12 +495,12 @@ def _get_available_slices_in_sub_block( command = ( 'gcloud beta compute reservations sub-blocks list' f' {reservation.name} --block-name={reservation.block_name} --project={reservation.project} --zone={reservation.zone} --filter="name={reservation.sub_block_name} AND' - ' healthInfo.healthStatus=HEALTHY" --format="csv(count,inUseCount)"' + ' healthInfo.healthStatus=HEALTHY" --format="csv(count,in_use_count)"' ) return_code, output = run_command_for_value( command, f'Check sub-block {reservation.sub_block_name} health', - dry_run_return_val='count,inUseCount\n16,0', + dry_run_return_val='count,in_use_count\n16,0', ) if return_code != 0: return 0, return_code @@ -513,7 +513,7 @@ def _get_available_slices_in_sub_block( try: row = rows[0] count = int(row['count']) - in_use_count = int(row['inUseCount']) + in_use_count = int(row['in_use_count']) available_slices = (count - in_use_count) // required_hosts return available_slices, 0 except ValueError: @@ -569,7 +569,7 @@ def _get_healthy_and_fitting_sub_blocks_in_block( f'--project={reservation.project} ' f'--zone={reservation.zone} ' '--filter="healthInfo.healthStatus=HEALTHY" ' - '--format="csv(name,count,inUseCount)"' + '--format="csv(name,count,in_use_count)"' ) return_code, output = run_command_for_value( command, @@ -584,7 +584,7 @@ def _get_healthy_and_fitting_sub_blocks_in_block( for row in rows: sub_block_name = row['name'] count = int(row['count']) - in_use_count = int(row['inUseCount']) + in_use_count = int(row['in_use_count']) available_slots = (count - in_use_count) // required_hosts if available_slots > 0: @@ -654,13 +654,13 @@ def _get_reservation_count( f'gcloud beta compute reservations describe {reservation.name} ' f'--project={reservation.project} ' f'--zone={reservation.zone} ' - '--format="csv(specificReservation.count:label=count,specificReservation.inUseCount:label=inUseCount,status)"' + '--format="csv(specificReservation.count:label=count,specificReservation.inUseCount:label=in_use_count,status)"' ) return_code, output = run_command_for_value( command, f'Get reservation count for {reservation.name}', - dry_run_return_val='count,inUseCount,status\n16,0,READY', + dry_run_return_val='count,in_use_count,status\n16,0,READY', ) if return_code != 0: return 0, return_code @@ -670,7 +670,7 @@ def _get_reservation_count( try: row = rows[0] if row['status'] == 'READY': - available_hosts = max(0, int(row['count']) - int(row['inUseCount'])) + available_hosts = max(0, int(row['count']) - int(row['in_use_count'])) return available_hosts // required_hosts, 0 except (ValueError, IndexError): pass diff --git a/src/xpk/core/capacity_test.py b/src/xpk/core/capacity_test.py index 6f6fd1ebb..1b8afb344 100644 --- a/src/xpk/core/capacity_test.py +++ b/src/xpk/core/capacity_test.py @@ -303,7 +303,7 @@ def test_assess_available_slices_sub_block_healthy( commands_tester: CommandsTester, ): commands_tester.set_result_for_command( - (0, 'count,inUseCount\n1,0'), + (0, 'count,in_use_count\n1,0'), 'gcloud beta compute reservations sub-blocks list', ) res = SubBlockReservationLink( @@ -346,7 +346,7 @@ def test_assess_available_slices_sub_block_unhealthy( def test_assess_available_slices_block_healthy(commands_tester: CommandsTester): # Mock 2 healthy sub-blocks commands_tester.set_result_for_command( - (0, 'name,count,inUseCount\nsub1,1,0\nsub2,1,0'), + (0, 'name,count,in_use_count\nsub1,1,0\nsub2,1,0'), 'gcloud beta compute reservations sub-blocks list', ) res = BlockReservationLink( @@ -413,7 +413,7 @@ def test_assess_available_slices_link_with_blocks( (0, 'block1'), 'gcloud beta compute reservations blocks list' ) commands_tester.set_result_for_command( - (0, 'name,count,inUseCount\nsub1,1,0'), + (0, 'name,count,in_use_count\nsub1,1,0'), 'gcloud beta compute reservations sub-blocks list', '--block-name=block1', ) @@ -446,7 +446,7 @@ def test_assess_available_slices_link_without_blocks( ) # Mock getting count commands_tester.set_result_for_command( - (0, 'count,inUseCount,status\n2,0,READY'), + (0, 'count,in_use_count,status\n2,0,READY'), 'gcloud beta compute reservations describe', ) @@ -471,7 +471,7 @@ def test_assess_available_slices_link_without_blocks_sub_block_targeting( ) # Mock getting count commands_tester.set_result_for_command( - (0, 'count,inUseCount,status\n2,0,READY'), + (0, 'count,in_use_count,status\n2,0,READY'), 'gcloud beta compute reservations describe', ) @@ -488,7 +488,7 @@ def test_assess_available_slices_host_filtering_insufficient_hosts( ): # Mock a sub-block that has 14 free hosts but we need 16 commands_tester.set_result_for_command( - (0, 'count,inUseCount\n16,2'), + (0, 'count,in_use_count\n16,2'), 'gcloud beta compute reservations sub-blocks list', ) res = SubBlockReservationLink( @@ -512,7 +512,7 @@ def test_assess_available_slices_host_filtering_sufficient_hosts( ): # Mock a reservation that has 46 free hosts, and we need 16 per slice. commands_tester.set_result_for_command( - (0, 'count,inUseCount,status\n48,2,READY'), + (0, 'count,in_use_count,status\n48,2,READY'), 'gcloud beta compute reservations describe', ) res_link = ReservationLink(project='p', name='r', zone='z') @@ -612,7 +612,7 @@ def test_assess_available_slices_mixed_reservations_with_subblock_targeting( project='project', name='res1', zone='zone', block_name='block1' ) commands_tester.set_result_for_command( - (0, 'name,count,inUseCount\nsub1,1,0\nsub2,1,0'), + (0, 'name,count,in_use_count\nsub1,1,0\nsub2,1,0'), 'gcloud beta compute reservations sub-blocks list res1', '--block-name=block1', ) @@ -626,7 +626,7 @@ def test_assess_available_slices_mixed_reservations_with_subblock_targeting( sub_block_name='sub3', ) commands_tester.set_result_for_command( - (0, 'count,inUseCount\n1,0'), + (0, 'count,in_use_count\n1,0'), 'gcloud beta compute reservations sub-blocks list res2', '--filter="name=sub3 AND healthInfo.healthStatus=HEALTHY"', ) @@ -686,7 +686,7 @@ def test_assess_available_slices_deduplicates(commands_tester: CommandsTester): ) sub_block_name = 'sub1' commands_tester.set_result_for_command( - (0, f'name,count,inUseCount\n{sub_block_name},1,0'), + (0, f'name,count,in_use_count\n{sub_block_name},1,0'), 'gcloud beta compute reservations sub-blocks list res1', '--block-name=block1', ) @@ -698,7 +698,7 @@ def test_assess_available_slices_deduplicates(commands_tester: CommandsTester): sub_block_name=sub_block_name, ) commands_tester.set_result_for_command( - (0, 'count,inUseCount\n1,0'), + (0, 'count,in_use_count\n1,0'), 'gcloud beta compute reservations sub-blocks list res1', '--block-name=block1', f'--filter="name={sub_block_name}', diff --git a/src/xpk/core/nodepool_test.py b/src/xpk/core/nodepool_test.py index 4f5298fb6..45b241e6e 100644 --- a/src/xpk/core/nodepool_test.py +++ b/src/xpk/core/nodepool_test.py @@ -479,7 +479,7 @@ def test_run_gke_node_pool_create_command_multiple_reservations( commands_tester.set_result_for_command( ( 0, - "count,inUseCount,status\n2,0,READY", + "count,in_use_count,status\n2,0,READY", ), "gcloud beta compute reservations describe", ) @@ -556,7 +556,7 @@ def test_run_gke_node_pool_create_command_partial_reservations( commands_tester.set_result_for_command( ( 0, - "count,inUseCount,status\n2,0,READY", + "count,in_use_count,status\n2,0,READY", ), "gcloud beta compute reservations describe", ) @@ -727,7 +727,7 @@ def test_run_gke_node_pool_create_command_super_slicing_exhaustion( commands_tester.set_result_for_command( ( 0, - "name,count,inUseCount\nsub-block1,2,0\nsub-block2,2,0", + "name,count,in_use_count\nsub-block1,2,0\nsub-block2,2,0", ), "gcloud beta compute reservations sub-blocks list", ) @@ -800,7 +800,7 @@ def test_run_gke_node_pool_create_command_super_slicing_insufficient_capacity( commands_tester.set_result_for_command( ( 0, - "name,count,inUseCount\nsub-block1,2,0", + "name,count,in_use_count\nsub-block1,2,0", ), "gcloud beta compute reservations sub-blocks list", ) From 2de1831865ab0e727214b3f72bdbba4496cd4338 Mon Sep 17 00:00:00 2001 From: Dominik Rabij Date: Tue, 10 Feb 2026 19:09:59 +0100 Subject: [PATCH 20/27] _get_reservation_count aggregateReservation fix --- recipes/Cluster_create_RayCluster.md | 2 +- recipes/Cluster_create_private.md | 2 +- recipes/Cluster_create_sub-slicing.md | 2 +- recipes/Cluster_create_super-slicing.md | 294 +++++++++++++++--- recipes/Cluster_create_with_gb200-4.md | 2 +- .../Cluster_create_with_shared_reservation.md | 2 +- src/xpk/core/capacity.py | 64 +++- src/xpk/core/capacity_test.py | 75 ++++- 8 files changed, 376 insertions(+), 67 deletions(-) diff --git a/recipes/Cluster_create_RayCluster.md b/recipes/Cluster_create_RayCluster.md index 94e08e3e5..b280460b8 100644 --- a/recipes/Cluster_create_RayCluster.md +++ b/recipes/Cluster_create_RayCluster.md @@ -60,7 +60,7 @@ gcloud beta container node-pools describe 0 --cluster golden-cluster --project=g kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true [XPK] Existing node pool names ['0'] [XPK] Task: `Get reservation count for golden-reservation` is implemented by the following command not running since it is a dry run. -gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="csv(specificReservation.count:label=count,specificReservation.inUseCount:label=in_use_count,status)" +gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="json(specificReservation,aggregateReservation,status)" [XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" [XPK] Breaking up a total of 1 commands into 1 batches [XPK] Pretending all the jobs succeeded diff --git a/recipes/Cluster_create_private.md b/recipes/Cluster_create_private.md index ff014d61f..ca9663405 100644 --- a/recipes/Cluster_create_private.md +++ b/recipes/Cluster_create_private.md @@ -64,7 +64,7 @@ gcloud beta container node-pools describe 0 --cluster golden-cluster-private --p kubectl get configmap golden-cluster-private-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true [XPK] Existing node pool names ['0'] [XPK] Task: `Get reservation count for golden-reservation` is implemented by the following command not running since it is a dry run. -gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="csv(specificReservation.count:label=count,specificReservation.inUseCount:label=in_use_count,status)" +gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="json(specificReservation,aggregateReservation,status)" [XPK] To complete NodepoolCreate-golden-cluster-private-np-0 we are executing gcloud beta container node-pools create golden-cluster-private-np-0 --location=us-central1 --cluster=golden-cluster-private --project=golden-project --node-locations=us-central1-a --machine-type=ct5p-hightpu-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" [XPK] To complete NodepoolCreate-cpu-np we are executing gcloud beta container node-pools create cpu-np --node-version=0 --cluster=golden-cluster-private --project=golden-project --node-locations=us-central1-a --location=us-central1 --num-nodes=1 --machine-type=n2-standard-64 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --enable-autoscaling --min-nodes=1 --max-nodes=20 [XPK] Breaking up a total of 2 commands into 1 batches diff --git a/recipes/Cluster_create_sub-slicing.md b/recipes/Cluster_create_sub-slicing.md index 22ca7af0b..0d6e69bde 100644 --- a/recipes/Cluster_create_sub-slicing.md +++ b/recipes/Cluster_create_sub-slicing.md @@ -62,7 +62,7 @@ gcloud beta container node-pools describe 0 --cluster golden-cluster --project=g kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true [XPK] Existing node pool names ['0'] [XPK] Task: `Get reservation count for golden-reservation` is implemented by the following command not running since it is a dry run. -gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="csv(specificReservation.count:label=count,specificReservation.inUseCount:label=in_use_count,status)" +gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="json(specificReservation,aggregateReservation,status)" [XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=ct6e-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --enable-gvnic --accelerator-network-profile=auto --node-labels=cloud.google.com/gke-networking-dra-driver=true --node-version=0 --num-nodes=4 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --placement-type=COMPACT --tpu-topology=4x4 --max-pods-per-node 15 [XPK] Breaking up a total of 1 commands into 1 batches [XPK] Pretending all the jobs succeeded diff --git a/recipes/Cluster_create_super-slicing.md b/recipes/Cluster_create_super-slicing.md index af25066d5..75ff7bee7 100644 --- a/recipes/Cluster_create_super-slicing.md +++ b/recipes/Cluster_create_super-slicing.md @@ -5,10 +5,10 @@ Creates a GKE cluster with TPU super-slicing enabled for multi-slice training. # Running the command ```shell #golden -DRY_RUN_RESERVATION_SUB_BLOCKS="name,count,inUseCount:sub0,16,0:sub1,16,0:sub2,16,15:sub3,16,0" xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-4x4x4 --reservation=golden-reservation/reservationBlocks/block --super-slicing --num-cubes=3 +DRY_RUN_RESERVATION_SUB_BLOCKS="name,count,in_use_count:sub0,16,0:sub1,16,0:sub2,16,15:sub3,16,0" xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-4x4x4 --reservation=golden-reservation/reservationBlocks/block --super-slicing --num-cubes=3 ``` diff --git a/recipes/Cluster_create_with_gb200-4.md b/recipes/Cluster_create_with_gb200-4.md index 3c844744f..a7dd1fee3 100644 --- a/recipes/Cluster_create_with_gb200-4.md +++ b/recipes/Cluster_create_with_gb200-4.md @@ -62,7 +62,7 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf [XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run. gcloud beta compute resource-policies describe gb200-4-1x72-placement-policy --project=golden-project --region=us-central1 [XPK] Task: `Get reservation count for golden-reservation` is implemented by the following command not running since it is a dry run. -gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="csv(specificReservation.count:label=count,specificReservation.inUseCount:label=in_use_count,status)" +gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="json(specificReservation,aggregateReservation,status)" [XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=a4x-highgpu-4g --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation --placement-policy=gb200-4-1x72-placement-policy --enable-gvnic --accelerator-network-profile=auto --node-labels=cloud.google.com/gke-networking-dra-driver=true --num-nodes=2 --accelerator type=nvidia-gb200,count=4,gpu-driver-version=latest --scopes="https://www.googleapis.com/auth/cloud-platform" [XPK] Breaking up a total of 1 commands into 1 batches [XPK] Pretending all the jobs succeeded diff --git a/recipes/Cluster_create_with_shared_reservation.md b/recipes/Cluster_create_with_shared_reservation.md index d521d367c..413d92d59 100644 --- a/recipes/Cluster_create_with_shared_reservation.md +++ b/recipes/Cluster_create_with_shared_reservation.md @@ -60,7 +60,7 @@ gcloud beta container node-pools describe 0 --cluster golden-cluster --project=g kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="ConfigData:data" --no-headers=true [XPK] Existing node pool names ['0'] [XPK] Task: `Get reservation count for golden-reservation` is implemented by the following command not running since it is a dry run. -gcloud beta compute reservations describe golden-reservation --project=reservation-project --zone=us-central1-a --format="csv(specificReservation.count:label=count,specificReservation.inUseCount:label=in_use_count,status)" +gcloud beta compute reservations describe golden-reservation --project=reservation-project --zone=us-central1-a --format="json(specificReservation,aggregateReservation,status)" [XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=projects/reservation-project/reservations/golden-reservation --enable-gvnic --node-version=0 --num-nodes=1 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" [XPK] Breaking up a total of 1 commands into 1 batches [XPK] Pretending all the jobs succeeded diff --git a/src/xpk/core/capacity.py b/src/xpk/core/capacity.py index 1b0c9d959..774deb2f6 100644 --- a/src/xpk/core/capacity.py +++ b/src/xpk/core/capacity.py @@ -16,6 +16,7 @@ import enum import os +import json from dataclasses import dataclass from typing import Sequence @@ -654,24 +655,67 @@ def _get_reservation_count( f'gcloud beta compute reservations describe {reservation.name} ' f'--project={reservation.project} ' f'--zone={reservation.zone} ' - '--format="csv(specificReservation.count:label=count,specificReservation.inUseCount:label=in_use_count,status)"' + '--format="json(specificReservation,aggregateReservation,status)"' ) return_code, output = run_command_for_value( command, f'Get reservation count for {reservation.name}', - dry_run_return_val='count,in_use_count,status\n16,0,READY', + dry_run_return_val=( + '{"specificReservation": {"count": 16, "inUseCount": 0},' + ' "status": "READY"}' + ), ) if return_code != 0: return 0, return_code - rows = _parse_csv_output(output) + try: + data = json.loads(output) + except json.JSONDecodeError: + xpk_print(f'Error: Unrecognized output format: "{output}".') + return 0, 1 + + if data.get('status') != 'READY': + return 0, 0 try: - row = rows[0] - if row['status'] == 'READY': - available_hosts = max(0, int(row['count']) - int(row['in_use_count'])) - return available_hosts // required_hosts, 0 - except (ValueError, IndexError): - pass - return 0, 0 + count = 0 + in_use_count = 0 + + specific_reservation = data.get('specificReservation') + aggregate_reservation = data.get('aggregateReservation') + + if specific_reservation: + count = int(specific_reservation.get('count', 0)) + in_use_count = int(specific_reservation.get('inUseCount', 0)) + elif aggregate_reservation: + reserved_resources = aggregate_reservation.get('reservedResources', []) + # Assuming reservedResources contains only relevant accelerators for the reservation, + # or we pick the first one as the primary type if multiple exist (unlikely for single reservation). + # TODO: We could somehow map the requested machine type to the accelerator type, also filtering for a given project and location. + target_accelerator_type: str | None = None + for resource in reserved_resources: + accelerator = resource.get('accelerator') + if not accelerator: + continue + + if not target_accelerator_type: + target_accelerator_type = accelerator.get('acceleratorType') + if accelerator.get('acceleratorType') == target_accelerator_type: + count += int(accelerator.get('acceleratorCount', 0)) + + if target_accelerator_type: + in_use_resources = aggregate_reservation.get('inUseResources', []) + accelerators = map(lambda r: r.get('accelerator'), in_use_resources) + in_use_count = sum( + accelerator.get('acceleratorCount', 0) + for accelerator in accelerators + if accelerator + and accelerator.get('acceleratorType') == target_accelerator_type + ) + + available_hosts = max(0, count - in_use_count) + return available_hosts // required_hosts, 0 + except (ValueError, IndexError, AttributeError) as e: + xpk_print(f'Error processing reservation data: {e}. Output: "{output}".') + return 0, 1 diff --git a/src/xpk/core/capacity_test.py b/src/xpk/core/capacity_test.py index 1b8afb344..02b20bca3 100644 --- a/src/xpk/core/capacity_test.py +++ b/src/xpk/core/capacity_test.py @@ -446,7 +446,13 @@ def test_assess_available_slices_link_without_blocks( ) # Mock getting count commands_tester.set_result_for_command( - (0, 'count,in_use_count,status\n2,0,READY'), + ( + 0, + ( + '{"specificReservation": {"count": 2, "inUseCount": 0}, "status":' + ' "READY"}' + ), + ), 'gcloud beta compute reservations describe', ) @@ -471,7 +477,13 @@ def test_assess_available_slices_link_without_blocks_sub_block_targeting( ) # Mock getting count commands_tester.set_result_for_command( - (0, 'count,in_use_count,status\n2,0,READY'), + ( + 0, + ( + '{"specificReservation": {"count": 2, "inUseCount": 0}, "status":' + ' "READY"}' + ), + ), 'gcloud beta compute reservations describe', ) @@ -512,7 +524,13 @@ def test_assess_available_slices_host_filtering_sufficient_hosts( ): # Mock a reservation that has 46 free hosts, and we need 16 per slice. commands_tester.set_result_for_command( - (0, 'count,in_use_count,status\n48,2,READY'), + ( + 0, + ( + '{"specificReservation": {"count": 48, "inUseCount": 2},' + ' "status": "READY"}' + ), + ), 'gcloud beta compute reservations describe', ) res_link = ReservationLink(project='p', name='r', zone='z') @@ -529,6 +547,57 @@ def test_assess_available_slices_host_filtering_sufficient_hosts( ] +def test_assess_available_slices_aggregate_reservation( + commands_tester: CommandsTester, +): + json_output = """ + { + "aggregateReservation": { + "reservedResources": [ + { + "accelerator": { + "acceleratorType": "accelerator-1", + "acceleratorCount": 100 + } + } + ], + "inUseResources": [ + { + "accelerator": { + "acceleratorType": "accelerator-1", + "acceleratorCount": 20 + } + }, + { + "accelerator": { + "acceleratorType": "accelerator-2", + "acceleratorCount": 50 + } + } + ] + }, + "status": "READY" + } + """ + commands_tester.set_result_for_command( + (0, json_output), + 'gcloud beta compute reservations describe', + ) + res = ReservationLink(project='project', name='reservation', zone='zone') + + slices, return_code = assess_available_slices( + [res], force_sub_block_targeting=False, required_hosts=1 + ) + + assert return_code == 0 + assert slices == [ + ReservationCapacity( + ReservationLink(project='project', name='reservation', zone='zone'), + available_slices=80, + ) + ] + + def test_assess_available_slices_failures_sub_block_check( commands_tester: CommandsTester, ): From dd90dd264142dd11cff69ec584ea83117e8ed4b9 Mon Sep 17 00:00:00 2001 From: Dominik Rabij Date: Wed, 11 Feb 2026 11:02:15 +0100 Subject: [PATCH 21/27] json parsing --- recipes/Cluster_create_super-slicing.md | 6 +- src/xpk/core/capacity.py | 297 ++++++++++++++---------- src/xpk/core/capacity_test.py | 83 ++++++- src/xpk/core/nodepool_test.py | 51 ++-- 4 files changed, 272 insertions(+), 165 deletions(-) diff --git a/recipes/Cluster_create_super-slicing.md b/recipes/Cluster_create_super-slicing.md index 75ff7bee7..e950261dc 100644 --- a/recipes/Cluster_create_super-slicing.md +++ b/recipes/Cluster_create_super-slicing.md @@ -5,10 +5,10 @@ Creates a GKE cluster with TPU super-slicing enabled for multi-slice training. # Running the command ```shell #golden -DRY_RUN_RESERVATION_SUB_BLOCKS="name,count,in_use_count:sub0,16,0:sub1,16,0:sub2,16,15:sub3,16,0" xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-4x4x4 --reservation=golden-reservation/reservationBlocks/block --super-slicing --num-cubes=3 +DRY_RUN_RESERVATION_SUB_BLOCKS='[{"name": "sub0", "count": 16, "inUseCount": 0}, {"name": "sub1", "count": 16, "inUseCount": 0}, {"name": "sub2", "count": 16, "inUseCount": 15}, {"name": "sub3", "count": 16, "inUseCount": 0}]' xpk cluster create --project=golden-project --zone=us-central1-a --cluster=golden-cluster --tpu-type=tpu7x-4x4x4 --reservation=golden-reservation/reservationBlocks/block --super-slicing --num-cubes=3 ```