From 66d55dd55c29dd5d0d7fe94eeefe914a80a53c9c Mon Sep 17 00:00:00 2001 From: Sudhanva Huruli Date: Wed, 2 Jul 2025 00:02:14 +0000 Subject: [PATCH 1/3] Azure Linux support in gpu provisioner --- .../azure-linux-annotation-nodeclaim.yaml | 44 ++ examples/azure-linux-examples.md | 255 +++++++++ examples/azure-linux-nodeclaim.yaml | 37 ++ pkg/providers/instance/instance.go | 34 +- pkg/providers/instance/instance_test.go | 161 ++++++ test/e2e/suites/suite_test.go | 508 ++++++++++++++++++ 6 files changed, 1038 insertions(+), 1 deletion(-) create mode 100644 examples/azure-linux-annotation-nodeclaim.yaml create mode 100644 examples/azure-linux-examples.md create mode 100644 examples/azure-linux-nodeclaim.yaml diff --git a/examples/azure-linux-annotation-nodeclaim.yaml b/examples/azure-linux-annotation-nodeclaim.yaml new file mode 100644 index 00000000..68dbff07 --- /dev/null +++ b/examples/azure-linux-annotation-nodeclaim.yaml @@ -0,0 +1,44 @@ +apiVersion: karpenter.sh/v1 +kind: NodeClaim +metadata: + name: azure-linux-annotation-nodeclaim + labels: + kaito.sh/workspace: "mistral-azure-linux" + annotations: + # Alternative method: using annotation instead of label + kaito.sh/node-image-family: "AzureLinux" + description: "NodeClaim using Azure Linux via annotation" +spec: + nodeClassRef: + apiVersion: karpenter.azure.com/v1alpha1 + kind: AKSNodeClass + name: default + requirements: + # Larger GPU instance for more demanding models + - key: node.kubernetes.io/instance-type + operator: In + values: ["Standard_NC24ads_A100_v4"] + # Allow both spot and on-demand + - key: karpenter.sh/capacity-type + operator: In + values: ["spot", "on-demand"] + # OS requirement + - key: kubernetes.io/os + operator: In + values: ["linux"] + # NodePool label + - key: karpenter.sh/nodepool + operator: In + values: ["kaito"] + resources: + requests: + storage: "200Gi" + taints: + # GPU taint + - key: "sku" + value: "gpu" + effect: NoSchedule + # Additional taint for high-end GPU workloads + - 
key: "workload-type" + value: "ai-inference" + effect: NoSchedule diff --git a/examples/azure-linux-examples.md b/examples/azure-linux-examples.md new file mode 100644 index 00000000..36ed173e --- /dev/null +++ b/examples/azure-linux-examples.md @@ -0,0 +1,255 @@ +# Azure Linux Support Examples for GPU Provisioner + +This document provides examples of how to use Azure Linux nodes with the GPU Provisioner and KAITO. + +## Overview + +The GPU Provisioner supports Azure Linux nodes through the `kaito.sh/node-image-family` label or annotation on NodeClaim resources. When this is set to `AzureLinux`, the provisioner will create AKS agent pools with the Azure Linux OS SKU. + +## Supported Image Families + +| Image Family | OS SKU | Description | +|-------------|--------|-------------| +| `AzureLinux` | AzureLinux | Container-optimized Linux distribution by Microsoft | +| `Ubuntu` | Ubuntu | Standard Ubuntu distribution (default) | +| `Ubuntu2204` | Ubuntu | Ubuntu 22.04 LTS | + +## Configuration Methods + +### Method 1: Using Labels (Recommended) + +```yaml +apiVersion: karpenter.sh/v1 +kind: NodeClaim +metadata: + name: azure-linux-gpu-node + labels: + kaito.sh/workspace: "my-workspace" + kaito.sh/node-image-family: "AzureLinux" +spec: + nodeClassRef: + apiVersion: karpenter.azure.com/v1alpha1 + kind: AKSNodeClass + name: default + requirements: + - key: node.kubernetes.io/instance-type + operator: In + values: ["Standard_NC12s_v3"] + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand"] + resources: + requests: + storage: "120Gi" + taints: + - key: "sku" + value: "gpu" + effect: NoSchedule +``` + +### Method 2: Using Annotations + +```yaml +apiVersion: karpenter.sh/v1 +kind: NodeClaim +metadata: + name: azure-linux-gpu-node-annotation + labels: + kaito.sh/workspace: "my-workspace" + annotations: + kaito.sh/node-image-family: "AzureLinux" +spec: + nodeClassRef: + apiVersion: karpenter.azure.com/v1alpha1 + kind: AKSNodeClass + name: default + 
requirements: + - key: node.kubernetes.io/instance-type + operator: In + values: ["Standard_NC12s_v3"] + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand"] + resources: + requests: + storage: "120Gi" + taints: + - key: "sku" + value: "gpu" + effect: NoSchedule +``` + +## KAITO Workspace Examples + +### Basic Azure Linux Workspace + +```yaml +apiVersion: kaito.sh/v1beta1 +kind: Workspace +metadata: + name: phi-2-azure-linux +spec: + resource: + instanceType: "Standard_NC12s_v3" + count: 1 + labelSelector: + matchLabels: + kaito.sh/node-image-family: "AzureLinux" + workload: "phi-2" + inference: + preset: + name: "phi-2" +``` + +### Azure Linux Workspace with Annotation + +```yaml +apiVersion: kaito.sh/v1beta1 +kind: Workspace +metadata: + name: falcon-7b-azure-linux + annotations: + kaito.sh/node-image-family: "AzureLinux" +spec: + resource: + instanceType: "Standard_NC24s_v3" + count: 1 + labelSelector: + matchLabels: + workload: "falcon-7b" + inference: + preset: + name: "falcon-7b" +``` + +## Implementation Details + +The GPU Provisioner determines the OS SKU based on the following logic in `instance.go`: + +1. **Check for label first** (takes precedence): + ```go + if imageFamily, ok := nodeClaim.Labels["kaito.sh/node-image-family"]; ok { + switch strings.ToLower(imageFamily) { + case "azurelinux": + ossku = armcontainerservice.OSSKUAzureLinux + case "ubuntu", "ubuntu2204": + ossku = armcontainerservice.OSSKUUbuntu + default: + klog.Warningf("Unknown imageFamily %s, defaulting to Ubuntu", imageFamily) + } + } + ``` + +2. **Fall back to annotation**: + ```go + } else if imageFamily, ok := nodeClaim.Annotations["kaito.sh/node-image-family"]; ok { + // Same logic as above + } + ``` + +3. **Default to Ubuntu** if neither label nor annotation is present + +## Case Sensitivity + +Image family values are **case-insensitive**. 
All of the following values will work: +- `AzureLinux` +- `azurelinux` +- `AZURELINUX` +- `AzUrElInUx` + +## Benefits of Azure Linux + +1. **Container-optimized**: Designed specifically for containerized workloads +2. **Security**: Enhanced security features and reduced attack surface +3. **Performance**: Optimized for cloud-native applications +4. **Microsoft Support**: Direct support from Microsoft +5. **Compliance**: Built with enterprise security and compliance in mind + +## Validation + +### Check Node OS Image + +```bash +# Check the OS image on your nodes +kubectl get nodes -o custom-columns=NAME:.metadata.name,OS-IMAGE:.status.nodeInfo.osImage + +# Example output for Azure Linux: +# NAME OS-IMAGE +# aks-azlinuxpool-12345678-vmss000000 Azure Linux 2.0.20240101 +``` + +### Check Agent Pool OS SKU + +```bash +# Check agent pool configuration +az aks agentpool show \ + --resource-group <resource-group> \ + --cluster-name <cluster-name> \ + --name <agent-pool-name> \ + --query "osSku" + +# Expected output: "AzureLinux" +``` + +## Troubleshooting + +### Common Issues + +1. **Unknown image family warning**: + ``` + Unknown imageFamily InvalidFamily in NodeClaim label, defaulting to Ubuntu + ``` + **Solution**: Ensure the image family name is one of: `AzureLinux`, `Ubuntu`, `Ubuntu2204` + +2. **Case sensitivity confusion**: + **Solution**: Remember that values are case-insensitive, so `azurelinux` works the same as `AzureLinux` + +3. **Labels vs annotations precedence**: + **Solution**: Labels take precedence over annotations. If both are specified, the label value will be used + +### Debug Commands + +```bash +# Check NodeClaim labels and annotations +kubectl get nodeclaim -o yaml + +# Check GPU Provisioner logs +kubectl logs -n gpu-provisioner deployment/gpu-provisioner + +# Check node labels and OS info +kubectl describe node +``` + +## Migration Guide + +### From Ubuntu to Azure Linux + +1. 
**Update your NodeClaim or Workspace**: + ```yaml + # Add this label to your NodeClaim or Workspace labelSelector + kaito.sh/node-image-family: "AzureLinux" + ``` + +2. **Verify the change**: + ```bash + # Check that new nodes use Azure Linux + kubectl get nodes -o custom-columns=NAME:.metadata.name,OS-IMAGE:.status.nodeInfo.osImage + ``` + +3. **Test your workloads**: + - Ensure your containerized workloads work correctly on Azure Linux + - Most workloads should work without changes + +## Best Practices + +1. **Use labels instead of annotations** for better visibility and tooling support +2. **Test thoroughly** when migrating existing workloads to Azure Linux +3. **Monitor resource usage** as Azure Linux may have different resource characteristics +4. **Keep GPU drivers updated** to ensure compatibility with Azure Linux +5. **Use specific instance types** that are known to work well with Azure Linux and your GPU workloads + +## Related Links + +- [Azure Linux Documentation](https://docs.microsoft.com/en-us/azure/azure-linux/) +- [KAITO Documentation](https://github.com/kaito-project/kaito) +- [GPU Provisioner Repository](https://github.com/Azure/gpu-provisioner) diff --git a/examples/azure-linux-nodeclaim.yaml b/examples/azure-linux-nodeclaim.yaml new file mode 100644 index 00000000..5a707d50 --- /dev/null +++ b/examples/azure-linux-nodeclaim.yaml @@ -0,0 +1,37 @@ +apiVersion: karpenter.sh/v1 +kind: NodeClaim +metadata: + name: azure-linux-gpu-nodeclaim + labels: + kaito.sh/workspace: "llama-azure-linux" + kaito.sh/node-image-family: "AzureLinux" +spec: + nodeClassRef: + apiVersion: karpenter.azure.com/v1alpha1 + kind: AKSNodeClass + name: default + requirements: + # GPU instance type + - key: node.kubernetes.io/instance-type + operator: In + values: ["Standard_NC12s_v3"] + # Capacity type preference + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand"] + # OS requirement + - key: kubernetes.io/os + operator: In + values: ["linux"] + # 
NodePool label + - key: karpenter.sh/nodepool + operator: In + values: ["kaito"] + resources: + requests: + storage: "120Gi" + taints: + # GPU taint to ensure only GPU workloads are scheduled + - key: "sku" + value: "gpu" + effect: NoSchedule diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index fe6957f2..348a69c4 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -355,13 +355,18 @@ func newAgentPoolObject(vmSize string, nodeClaim *karpenterv1.NodeClaim) (armcon diskSizeGB = int32(storage.Value() >> 30) } + // Determine OSSKU from NodeClaim labels, annotations, or NodeClassRef, default to Ubuntu + // Note: NodeClassRef support could be added in the future if needed, + // but requires importing external dependencies + return armcontainerservice.AgentPool{ Properties: &armcontainerservice.ManagedClusterAgentPoolProfileProperties{ NodeLabels: labels, - NodeTaints: taintsStr, //[]*string{to.Ptr("sku=gpu:NoSchedule")}, + NodeTaints: taintsStr, Type: to.Ptr(scaleSetsType), VMSize: to.Ptr(vmSize), OSType: to.Ptr(armcontainerservice.OSTypeLinux), + OSSKU: determineOSSKU(nodeClaim), Count: to.Ptr(int32(1)), OSDiskSizeGB: to.Ptr(diskSizeGB), }, @@ -411,3 +416,30 @@ func agentPoolIsCreatedFromNodeClaim(ap *armcontainerservice.AgentPool) bool { return false } + +// determineOSSKU determines the OS SKU from NodeClaim labels or annotations, defaulting to Ubuntu +func determineOSSKU(nodeClaim *karpenterv1.NodeClaim) *armcontainerservice.OSSKU { + // Helper function to convert image family to OSSKU + convertImageFamilyToOSSKU := func(imageFamily, source string) *armcontainerservice.OSSKU { + switch strings.ToLower(imageFamily) { + case "azurelinux": + return to.Ptr(armcontainerservice.OSSKUAzureLinux) + case "ubuntu", "ubuntu2204": + return to.Ptr(armcontainerservice.OSSKUUbuntu) + default: + klog.Warningf("Unknown imageFamily %s in NodeClaim %s, defaulting to Ubuntu", imageFamily, source) + return 
to.Ptr(armcontainerservice.OSSKUUbuntu) + } + } + // First check for a direct label on the NodeClaim + if imageFamily, ok := nodeClaim.Labels["kaito.sh/node-image-family"]; ok { + return convertImageFamilyToOSSKU(imageFamily, "label") + } + // Check annotations as fallback + if imageFamily, ok := nodeClaim.Annotations["kaito.sh/node-image-family"]; ok { + return convertImageFamilyToOSSKU(imageFamily, "annotation") + } + + // Default to Ubuntu if no image family is specified + return to.Ptr(armcontainerservice.OSSKUUbuntu) +} diff --git a/pkg/providers/instance/instance_test.go b/pkg/providers/instance/instance_test.go index c81d9da0..c7a154fb 100644 --- a/pkg/providers/instance/instance_test.go +++ b/pkg/providers/instance/instance_test.go @@ -807,6 +807,166 @@ func TestCreateFailure(t *testing.T) { } } +func TestNewAgentPoolObjectWithImageFamily(t *testing.T) { + testCases := []struct { + name string + vmSize string + nodeClaim *karpenterv1.NodeClaim + expectedOSSKU armcontainerservice.OSSKU + }{ + { + name: "NodeClaim with AzureLinux image family label", + vmSize: "Standard_NC6s_v3", + nodeClaim: &karpenterv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nodeclaim", + Labels: map[string]string{ + "kaito.sh/node-image-family": "AzureLinux", + }, + }, + Spec: karpenterv1.NodeClaimSpec{ + Resources: karpenterv1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceStorage: *resource.NewQuantity(30*1024*1024*1024, resource.DecimalSI), + }, + }, + }, + }, + expectedOSSKU: armcontainerservice.OSSKUAzureLinux, + }, + { + name: "NodeClaim with Ubuntu image family label", + vmSize: "Standard_NC6s_v3", + nodeClaim: &karpenterv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nodeclaim", + Labels: map[string]string{ + "kaito.sh/node-image-family": "Ubuntu", + }, + }, + Spec: karpenterv1.NodeClaimSpec{ + Resources: karpenterv1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceStorage: *resource.NewQuantity(30*1024*1024*1024, 
resource.DecimalSI), + }, + }, + }, + }, + expectedOSSKU: armcontainerservice.OSSKUUbuntu, + }, + { + name: "NodeClaim with Ubuntu2204 image family label", + vmSize: "Standard_NC6s_v3", + nodeClaim: &karpenterv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nodeclaim", + Labels: map[string]string{ + "kaito.sh/node-image-family": "Ubuntu2204", + }, + }, + Spec: karpenterv1.NodeClaimSpec{ + Resources: karpenterv1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceStorage: *resource.NewQuantity(30*1024*1024*1024, resource.DecimalSI), + }, + }, + }, + }, + expectedOSSKU: armcontainerservice.OSSKUUbuntu, + }, + { + name: "NodeClaim with AzureLinux image family annotation", + vmSize: "Standard_NC6s_v3", + nodeClaim: &karpenterv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nodeclaim", + Annotations: map[string]string{ + "kaito.sh/node-image-family": "AzureLinux", + }, + }, + Spec: karpenterv1.NodeClaimSpec{ + Resources: karpenterv1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceStorage: *resource.NewQuantity(30*1024*1024*1024, resource.DecimalSI), + }, + }, + }, + }, + expectedOSSKU: armcontainerservice.OSSKUAzureLinux, + }, + { + name: "NodeClaim with unknown image family defaults to Ubuntu", + vmSize: "Standard_NC6s_v3", + nodeClaim: &karpenterv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nodeclaim", + Labels: map[string]string{ + "kaito.sh/node-image-family": "Unknown", + }, + }, + Spec: karpenterv1.NodeClaimSpec{ + Resources: karpenterv1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceStorage: *resource.NewQuantity(30*1024*1024*1024, resource.DecimalSI), + }, + }, + }, + }, + expectedOSSKU: armcontainerservice.OSSKUUbuntu, + }, + { + name: "NodeClaim without image family label defaults to Ubuntu", + vmSize: "Standard_NC6s_v3", + nodeClaim: &karpenterv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nodeclaim", + }, + Spec: karpenterv1.NodeClaimSpec{ + Resources: 
karpenterv1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceStorage: *resource.NewQuantity(30*1024*1024*1024, resource.DecimalSI), + }, + }, + }, + }, + expectedOSSKU: armcontainerservice.OSSKUUbuntu, + }, + { + name: "Label takes precedence over annotation", + vmSize: "Standard_NC6s_v3", + nodeClaim: &karpenterv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nodeclaim", + Labels: map[string]string{ + "kaito.sh/node-image-family": "AzureLinux", + }, + Annotations: map[string]string{ + "kaito.sh/node-image-family": "Ubuntu", + }, + }, + Spec: karpenterv1.NodeClaimSpec{ + Resources: karpenterv1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceStorage: *resource.NewQuantity(30*1024*1024*1024, resource.DecimalSI), + }, + }, + }, + }, + expectedOSSKU: armcontainerservice.OSSKUAzureLinux, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + result, err := newAgentPoolObject(tc.vmSize, tc.nodeClaim) + + assert.NoError(t, err) + assert.Equal(t, tc.expectedOSSKU, *result.Properties.OSSKU) + assert.Equal(t, armcontainerservice.OSTypeLinux, *result.Properties.OSType) + }) + } +} + func createTestProvider(agentPoolsAPIMocks *fake.MockAgentPoolsAPI, mockK8sClient *fake.MockClient) *Provider { mockAzClient := NewAZClientFromAPI(agentPoolsAPIMocks) return NewProvider(mockAzClient, mockK8sClient, "testRG", "testCluster") @@ -842,6 +1002,7 @@ func GetAgentPoolObjWithName(apName string, apId string, vmSize string) armconta }, } } + func GetNodeList(nodes []v1.Node) *v1.NodeList { return &v1.NodeList{ Items: nodes, diff --git a/test/e2e/suites/suite_test.go b/test/e2e/suites/suite_test.go index 4fa18b72..95dc9dab 100644 --- a/test/e2e/suites/suite_test.go +++ b/test/e2e/suites/suite_test.go @@ -315,4 +315,512 @@ var _ = Describe("GPU NodeClaim", func() { env.ExpectDeleted(node) }) + It("should provision one GPU node with Azure Linux via label", func() { + nodeClaimLabels := map[string]string{ + 
"karpenter.sh/provisioner-name": "default", + "kaito.sh/workspace": "azure-linux-test", + "kaito.sh/node-image-family": "AzureLinux", + } + + nc := test.NodeClaim(karpenterv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "azlinuxtestnc", + Labels: nodeClaimLabels, + }, + Spec: karpenterv1.NodeClaimSpec{ + NodeClassRef: &karpenterv1.NodeClassReference{ + Name: "default", + Kind: "AKSNodeClass", + }, + Resources: karpenterv1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceStorage: lo.FromPtr(resource.NewQuantity(120*1024*1024*1024, resource.DecimalSI)), + }, + }, + Requirements: []karpenterv1.NodeSelectorRequirementWithMinValues{ + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_NC12s_v3"}, + }, + }, + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: karpenterv1.NodePoolLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{"kaito"}, + }, + }, + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: v1.LabelOSStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"linux"}, + }, + }, + }, + Taints: []v1.Taint{ + { + Key: "sku", + Value: "gpu", + Effect: v1.TaintEffectNoSchedule, + }, + }, + }, + }) + + DeferCleanup(func() { + env.ExpectDeleted(nc) + env.EventuallyExpectCreatedNodeClaimCount("==", 0) + env.EventuallyExpectNodeCount("==", 0) + }) + + env.ExpectCreated(nc) + env.EventuallyExpectCreatedNodeClaimCount("==", 1) + env.EventuallyExpectNodeClaimsReady(nc) + env.EventuallyExpectNodeCount("==", 1) + node := env.EventuallyExpectInitializedNodeCount("==", 1)[0] + + // Verify the node is running Azure Linux + Expect(node.Status.NodeInfo.OSImage).To(ContainSubstring("Azure"), + "Node should be running Azure Linux, got OS: %s", node.Status.NodeInfo.OSImage) + }) + + It("should provision one GPU node with Azure Linux via annotation", func() { + nodeClaimLabels := map[string]string{ + 
"karpenter.sh/provisioner-name": "default", + "kaito.sh/workspace": "azure-linux-annotation-test", + } + + nodeClaimAnnotations := map[string]string{ + "kaito.sh/node-image-family": "AzureLinux", + } + + nc := test.NodeClaim(karpenterv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "azlinuxannotationtestnc", + Labels: nodeClaimLabels, + Annotations: nodeClaimAnnotations, + }, + Spec: karpenterv1.NodeClaimSpec{ + NodeClassRef: &karpenterv1.NodeClassReference{ + Name: "default", + Kind: "AKSNodeClass", + }, + Resources: karpenterv1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceStorage: lo.FromPtr(resource.NewQuantity(120*1024*1024*1024, resource.DecimalSI)), + }, + }, + Requirements: []karpenterv1.NodeSelectorRequirementWithMinValues{ + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_NC12s_v3"}, + }, + }, + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: karpenterv1.NodePoolLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{"kaito"}, + }, + }, + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: v1.LabelOSStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"linux"}, + }, + }, + }, + Taints: []v1.Taint{ + { + Key: "sku", + Value: "gpu", + Effect: v1.TaintEffectNoSchedule, + }, + }, + }, + }) + + DeferCleanup(func() { + env.ExpectDeleted(nc) + env.EventuallyExpectCreatedNodeClaimCount("==", 0) + env.EventuallyExpectNodeCount("==", 0) + }) + + env.ExpectCreated(nc) + env.EventuallyExpectCreatedNodeClaimCount("==", 1) + env.EventuallyExpectNodeClaimsReady(nc) + env.EventuallyExpectNodeCount("==", 1) + node := env.EventuallyExpectInitializedNodeCount("==", 1)[0] + + // Verify the node is running Azure Linux + Expect(node.Status.NodeInfo.OSImage).To(ContainSubstring("Azure"), + "Node should be running Azure Linux, got OS: %s", node.Status.NodeInfo.OSImage) + }) + + It("should handle 
case-insensitive Azure Linux image family values", func() { + nodeClaimLabels := map[string]string{ + "karpenter.sh/provisioner-name": "default", + "kaito.sh/workspace": "azure-linux-case-test", + "kaito.sh/node-image-family": "azurelinux", // lowercase + } + + nc := test.NodeClaim(karpenterv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "azlinuxcasetestnc", + Labels: nodeClaimLabels, + }, + Spec: karpenterv1.NodeClaimSpec{ + NodeClassRef: &karpenterv1.NodeClassReference{ + Name: "default", + Kind: "AKSNodeClass", + }, + Resources: karpenterv1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceStorage: lo.FromPtr(resource.NewQuantity(120*1024*1024*1024, resource.DecimalSI)), + }, + }, + Requirements: []karpenterv1.NodeSelectorRequirementWithMinValues{ + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_NC12s_v3"}, + }, + }, + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: karpenterv1.NodePoolLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{"kaito"}, + }, + }, + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: v1.LabelOSStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"linux"}, + }, + }, + }, + Taints: []v1.Taint{ + { + Key: "sku", + Value: "gpu", + Effect: v1.TaintEffectNoSchedule, + }, + }, + }, + }) + + DeferCleanup(func() { + env.ExpectDeleted(nc) + env.EventuallyExpectCreatedNodeClaimCount("==", 0) + env.EventuallyExpectNodeCount("==", 0) + }) + + env.ExpectCreated(nc) + env.EventuallyExpectCreatedNodeClaimCount("==", 1) + env.EventuallyExpectNodeClaimsReady(nc) + env.EventuallyExpectNodeCount("==", 1) + node := env.EventuallyExpectInitializedNodeCount("==", 1)[0] + + // Verify the node is running Azure Linux (case-insensitive support) + Expect(node.Status.NodeInfo.OSImage).To(ContainSubstring("Azure"), + "Node should be running Azure Linux with case-insensitive support, got OS: %s", 
node.Status.NodeInfo.OSImage) + }) + + It("should fallback to Ubuntu when invalid image family is specified", func() { + nodeClaimLabels := map[string]string{ + "karpenter.sh/provisioner-name": "default", + "kaito.sh/workspace": "fallback-test", + "kaito.sh/node-image-family": "InvalidImageFamily", + } + + nc := test.NodeClaim(karpenterv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "fallbacktestnc", + Labels: nodeClaimLabels, + }, + Spec: karpenterv1.NodeClaimSpec{ + NodeClassRef: &karpenterv1.NodeClassReference{ + Name: "default", + Kind: "AKSNodeClass", + }, + Resources: karpenterv1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceStorage: lo.FromPtr(resource.NewQuantity(120*1024*1024*1024, resource.DecimalSI)), + }, + }, + Requirements: []karpenterv1.NodeSelectorRequirementWithMinValues{ + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_NC12s_v3"}, + }, + }, + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: karpenterv1.NodePoolLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{"kaito"}, + }, + }, + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: v1.LabelOSStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"linux"}, + }, + }, + }, + Taints: []v1.Taint{ + { + Key: "sku", + Value: "gpu", + Effect: v1.TaintEffectNoSchedule, + }, + }, + }, + }) + + DeferCleanup(func() { + env.ExpectDeleted(nc) + env.EventuallyExpectCreatedNodeClaimCount("==", 0) + env.EventuallyExpectNodeCount("==", 0) + }) + + env.ExpectCreated(nc) + env.EventuallyExpectCreatedNodeClaimCount("==", 1) + env.EventuallyExpectNodeClaimsReady(nc) + env.EventuallyExpectNodeCount("==", 1) + node := env.EventuallyExpectInitializedNodeCount("==", 1)[0] + + // Verify the node falls back to Ubuntu when invalid image family is specified + Expect(node.Status.NodeInfo.OSImage).To(ContainSubstring("Ubuntu"), + "Node should fallback to 
Ubuntu when invalid image family is specified, got OS: %s", node.Status.NodeInfo.OSImage) + }) + + It("should terminate node when delete triggered", func() { + nodeClaimLabels := map[string]string{ + "karpenter.sh/provisioner-name": "default", + "kaito.sh/workspace": "none", + } + + nc := test.NodeClaim(karpenterv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "wctestnc5", + Labels: nodeClaimLabels, + }, + Spec: karpenterv1.NodeClaimSpec{ + NodeClassRef: &karpenterv1.NodeClassReference{ + Name: "default", + Kind: "AKSNodeClass", + }, + Resources: karpenterv1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceStorage: lo.FromPtr(resource.NewQuantity(120*1024*1024*1024, resource.DecimalSI)), + }, + }, + Requirements: []karpenterv1.NodeSelectorRequirementWithMinValues{ + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_NC12s_v3"}, + }, + }, + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: karpenterv1.NodePoolLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{"kaito"}, + }, + }, + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: v1.LabelOSStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"linux"}, + }, + }, + }, + Taints: []v1.Taint{ + { + Key: "sku", + Value: "gpu", + Effect: v1.TaintEffectNoSchedule, + }, + }, + }, + }) + + DeferCleanup(func() { + env.ExpectDeleted(nc) + env.EventuallyExpectCreatedNodeClaimCount("==", 0) + env.EventuallyExpectNodeCount("==", 0) + }) + + env.ExpectCreated(nc) + env.EventuallyExpectCreatedNodeClaimCount("==", 1) + env.EventuallyExpectNodeClaimsReady(nc) + env.EventuallyExpectNodeCount("==", 1) + node := env.EventuallyExpectInitializedNodeCount("==", 1)[0] + + // delete node for triggering terminate all resrouces like NodeClaim, CloudProvider Instance + env.ExpectDeleted(node) + }) + + It("should terminate node when delete triggered (Azure Linux)", func() { + 
nodeClaimLabels := map[string]string{ + "karpenter.sh/provisioner-name": "default", + "kaito.sh/workspace": "azure-linux-test", + "kaito.sh/node-image-family": "AzureLinux", + } + + nc := test.NodeClaim(karpenterv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "azlinuxtestnc2", + Labels: nodeClaimLabels, + }, + Spec: karpenterv1.NodeClaimSpec{ + NodeClassRef: &karpenterv1.NodeClassReference{ + Name: "default", + Kind: "AKSNodeClass", + }, + Resources: karpenterv1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceStorage: lo.FromPtr(resource.NewQuantity(120*1024*1024*1024, resource.DecimalSI)), + }, + }, + Requirements: []karpenterv1.NodeSelectorRequirementWithMinValues{ + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_NC12s_v3"}, + }, + }, + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: karpenterv1.NodePoolLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{"kaito"}, + }, + }, + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: v1.LabelOSStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"linux"}, + }, + }, + }, + Taints: []v1.Taint{ + { + Key: "sku", + Value: "gpu", + Effect: v1.TaintEffectNoSchedule, + }, + }, + }, + }) + + DeferCleanup(func() { + env.ExpectDeleted(nc) + env.EventuallyExpectCreatedNodeClaimCount("==", 0) + env.EventuallyExpectNodeCount("==", 0) + }) + + env.ExpectCreated(nc) + env.EventuallyExpectCreatedNodeClaimCount("==", 1) + env.EventuallyExpectNodeClaimsReady(nc) + env.EventuallyExpectNodeCount("==", 1) + node := env.EventuallyExpectInitializedNodeCount("==", 1)[0] + + // delete node for triggering terminate all resrouces like NodeClaim, CloudProvider Instance + env.ExpectDeleted(node) + }) + + It("should terminate node when delete triggered (Azure Linux - annotation)", func() { + nodeClaimLabels := map[string]string{ + "karpenter.sh/provisioner-name": "default", + 
"kaito.sh/workspace": "azure-linux-annotation-test", + } + + nodeClaimAnnotations := map[string]string{ + "kaito.sh/node-image-family": "AzureLinux", + } + + nc := test.NodeClaim(karpenterv1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: "azlinuxannotationtestnc2", + Labels: nodeClaimLabels, + Annotations: nodeClaimAnnotations, + }, + Spec: karpenterv1.NodeClaimSpec{ + NodeClassRef: &karpenterv1.NodeClassReference{ + Name: "default", + Kind: "AKSNodeClass", + }, + Resources: karpenterv1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceStorage: lo.FromPtr(resource.NewQuantity(120*1024*1024*1024, resource.DecimalSI)), + }, + }, + Requirements: []karpenterv1.NodeSelectorRequirementWithMinValues{ + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"Standard_NC12s_v3"}, + }, + }, + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: karpenterv1.NodePoolLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{"kaito"}, + }, + }, + { + NodeSelectorRequirement: v1.NodeSelectorRequirement{ + Key: v1.LabelOSStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"linux"}, + }, + }, + }, + Taints: []v1.Taint{ + { + Key: "sku", + Value: "gpu", + Effect: v1.TaintEffectNoSchedule, + }, + }, + }, + }) + + DeferCleanup(func() { + env.ExpectDeleted(nc) + env.EventuallyExpectCreatedNodeClaimCount("==", 0) + env.EventuallyExpectNodeCount("==", 0) + }) + + env.ExpectCreated(nc) + env.EventuallyExpectCreatedNodeClaimCount("==", 1) + env.EventuallyExpectNodeClaimsReady(nc) + env.EventuallyExpectNodeCount("==", 1) + node := env.EventuallyExpectInitializedNodeCount("==", 1)[0] + + // delete node for triggering terminate all resrouces like NodeClaim, CloudProvider Instance + env.ExpectDeleted(node) + }) + }) From 752abf029011d9d155dab6315fe4000bb387ab9e Mon Sep 17 00:00:00 2001 From: Sudhanva Huruli Date: Thu, 3 Jul 2025 04:40:07 +0000 Subject: [PATCH 2/3] 
fix nodepool name lengths causing test cases to fail --- examples/azure-linux-annotation-nodeclaim.yaml | 6 +++--- examples/azure-linux-nodeclaim.yaml | 6 +++--- test/e2e/suites/suite_test.go | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/azure-linux-annotation-nodeclaim.yaml b/examples/azure-linux-annotation-nodeclaim.yaml index 68dbff07..0cb7b755 100644 --- a/examples/azure-linux-annotation-nodeclaim.yaml +++ b/examples/azure-linux-annotation-nodeclaim.yaml @@ -1,7 +1,7 @@ apiVersion: karpenter.sh/v1 kind: NodeClaim metadata: - name: azure-linux-annotation-nodeclaim + name: azlv2 labels: kaito.sh/workspace: "mistral-azure-linux" annotations: @@ -10,9 +10,9 @@ metadata: description: "NodeClaim using Azure Linux via annotation" spec: nodeClassRef: - apiVersion: karpenter.azure.com/v1alpha1 + name: gpuonly1 kind: AKSNodeClass - name: default + group: karpenter.azure.com requirements: # Larger GPU instance for more demanding models - key: node.kubernetes.io/instance-type diff --git a/examples/azure-linux-nodeclaim.yaml b/examples/azure-linux-nodeclaim.yaml index 5a707d50..ed3d0954 100644 --- a/examples/azure-linux-nodeclaim.yaml +++ b/examples/azure-linux-nodeclaim.yaml @@ -1,15 +1,15 @@ apiVersion: karpenter.sh/v1 kind: NodeClaim metadata: - name: azure-linux-gpu-nodeclaim + name: azlinuxgpu labels: kaito.sh/workspace: "llama-azure-linux" kaito.sh/node-image-family: "AzureLinux" spec: nodeClassRef: - apiVersion: karpenter.azure.com/v1alpha1 + name: gpuonly1 kind: AKSNodeClass - name: default + group: karpenter.azure.com requirements: # GPU instance type - key: node.kubernetes.io/instance-type diff --git a/test/e2e/suites/suite_test.go b/test/e2e/suites/suite_test.go index 95dc9dab..50830751 100644 --- a/test/e2e/suites/suite_test.go +++ b/test/e2e/suites/suite_test.go @@ -324,7 +324,7 @@ var _ = Describe("GPU NodeClaim", func() { nc := test.NodeClaim(karpenterv1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ - Name: 
"azlinuxtestnc", + Name: "azlinuxnc", Labels: nodeClaimLabels, }, Spec: karpenterv1.NodeClaimSpec{ @@ -399,7 +399,7 @@ var _ = Describe("GPU NodeClaim", func() { nc := test.NodeClaim(karpenterv1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ - Name: "azlinuxannotationtestnc", + Name: "azlinuxannot", Labels: nodeClaimLabels, Annotations: nodeClaimAnnotations, }, @@ -472,7 +472,7 @@ var _ = Describe("GPU NodeClaim", func() { nc := test.NodeClaim(karpenterv1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ - Name: "azlinuxcasetestnc", + Name: "azlinuxcase", Labels: nodeClaimLabels, }, Spec: karpenterv1.NodeClaimSpec{ @@ -544,7 +544,7 @@ var _ = Describe("GPU NodeClaim", func() { nc := test.NodeClaim(karpenterv1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ - Name: "fallbacktestnc", + Name: "fallbacknc", Labels: nodeClaimLabels, }, Spec: karpenterv1.NodeClaimSpec{ @@ -686,7 +686,7 @@ var _ = Describe("GPU NodeClaim", func() { nc := test.NodeClaim(karpenterv1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ - Name: "azlinuxtestnc2", + Name: "azlinuxnc2", Labels: nodeClaimLabels, }, Spec: karpenterv1.NodeClaimSpec{ @@ -760,7 +760,7 @@ var _ = Describe("GPU NodeClaim", func() { nc := test.NodeClaim(karpenterv1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ - Name: "azlinuxannotationtestnc2", + Name: "azlinuxann2", Labels: nodeClaimLabels, Annotations: nodeClaimAnnotations, }, From 1249efd78aa8cecb4fa6644c1f7eb8de88f7bfcb Mon Sep 17 00:00:00 2001 From: Sudhanva Huruli Date: Thu, 3 Jul 2025 16:31:19 +0000 Subject: [PATCH 3/3] Ensure test cases also check for Mariner name --- test/e2e/suites/suite_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/e2e/suites/suite_test.go b/test/e2e/suites/suite_test.go index 50830751..697b41e5 100644 --- a/test/e2e/suites/suite_test.go +++ b/test/e2e/suites/suite_test.go @@ -383,7 +383,7 @@ var _ = Describe("GPU NodeClaim", func() { node := env.EventuallyExpectInitializedNodeCount("==", 1)[0] // Verify the node is running Azure Linux 
- Expect(node.Status.NodeInfo.OSImage).To(ContainSubstring("Azure"), + Expect(node.Status.NodeInfo.OSImage).To(Or(ContainSubstring("Azure"), ContainSubstring("Mariner")), "Node should be running Azure Linux, got OS: %s", node.Status.NodeInfo.OSImage) }) @@ -459,7 +459,7 @@ var _ = Describe("GPU NodeClaim", func() { node := env.EventuallyExpectInitializedNodeCount("==", 1)[0] // Verify the node is running Azure Linux - Expect(node.Status.NodeInfo.OSImage).To(ContainSubstring("Azure"), + Expect(node.Status.NodeInfo.OSImage).To(Or(ContainSubstring("Azure"), ContainSubstring("Mariner")), "Node should be running Azure Linux, got OS: %s", node.Status.NodeInfo.OSImage) }) @@ -531,7 +531,7 @@ var _ = Describe("GPU NodeClaim", func() { node := env.EventuallyExpectInitializedNodeCount("==", 1)[0] // Verify the node is running Azure Linux (case-insensitive support) - Expect(node.Status.NodeInfo.OSImage).To(ContainSubstring("Azure"), + Expect(node.Status.NodeInfo.OSImage).To(Or(ContainSubstring("Azure"), ContainSubstring("Mariner")), "Node should be running Azure Linux with case-insensitive support, got OS: %s", node.Status.NodeInfo.OSImage) })