From f642f8a7a044b8c2b28057b5836791aaf2ff197b Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Tue, 13 Jan 2026 09:34:20 +0100 Subject: [PATCH] Add GPU support for RHOAI e2e tests in Prow - Enable GPU for vLLM model serving in the Prow e2e test environment. - Add GPU node tolerations to ensure pods are scheduled on GPU-enabled nodes. - Resolve the llama-stack image dynamically from the LCS repository Containerfile. - Use the latest vLLM image from the Red Hat template. --- tests/e2e-prow/rhoai/configs/run.yaml | 10 +- .../rhoai/manifests/gpu/cluster-policy.yaml | 31 +++ .../rhoai/manifests/gpu/create-nfd.yaml | 8 + .../manifests/lightspeed/llama-stack.yaml | 4 +- .../rhoai/manifests/namespaces/nfd.yaml | 5 + .../manifests/namespaces/nvidia-operator.yaml | 5 + .../manifests/operators/operatorgroup.yaml | 22 +- .../rhoai/manifests/operators/operators.yaml | 27 ++- ...e.yaml => vllm-inference-service-cpu.yaml} | 0 .../vllm/vllm-inference-service-gpu.yaml | 25 +++ .../manifests/vllm/vllm-runtime-gpu.yaml | 82 +++++++ tests/e2e-prow/rhoai/pipeline-services.sh | 4 +- tests/e2e-prow/rhoai/pipeline-vllm.sh | 4 + tests/e2e-prow/rhoai/pipeline.sh | 92 ++++++-- tests/e2e-prow/rhoai/scripts/bootstrap.sh | 57 ++++- tests/e2e-prow/rhoai/scripts/deploy-vllm.sh | 48 +++- .../rhoai/scripts/fetch-vllm-image.sh | 36 +++ .../rhoai/scripts/get-vllm-pod-info.sh | 102 +++++++-- tests/e2e-prow/rhoai/scripts/gpu-setup.sh | 208 ++++++++++++++++++ 19 files changed, 722 insertions(+), 48 deletions(-) create mode 100644 tests/e2e-prow/rhoai/manifests/gpu/cluster-policy.yaml create mode 100644 tests/e2e-prow/rhoai/manifests/gpu/create-nfd.yaml create mode 100644 tests/e2e-prow/rhoai/manifests/namespaces/nfd.yaml create mode 100644 tests/e2e-prow/rhoai/manifests/namespaces/nvidia-operator.yaml rename tests/e2e-prow/rhoai/manifests/vllm/{inference-service.yaml => vllm-inference-service-cpu.yaml} (100%) create mode 100644 tests/e2e-prow/rhoai/manifests/vllm/vllm-inference-service-gpu.yaml create mode 100644 tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml create mode 100755 tests/e2e-prow/rhoai/scripts/fetch-vllm-image.sh create mode 100755 tests/e2e-prow/rhoai/scripts/gpu-setup.sh diff --git a/tests/e2e-prow/rhoai/configs/run.yaml b/tests/e2e-prow/rhoai/configs/run.yaml index 645e88dc1..9e83a9fb4 100644 --- a/tests/e2e-prow/rhoai/configs/run.yaml +++ b/tests/e2e-prow/rhoai/configs/run.yaml @@ -36,10 +36,10 @@ providers: api_token: ${env.VLLM_API_KEY} tls_verify: false max_tokens: 1024 - - provider_id: openai - provider_type: remote::openai - config: - api_key: ${env.OPENAI_API_KEY} + # - provider_id: openai + # provider_type: remote::openai + # config: + # api_key: ${env.OPENAI_API_KEY} - config: {} provider_id: sentence-transformers provider_type: inline::sentence-transformers @@ -144,7 +144,7 @@ registered_resources: shields: - shield_id: llama-guard provider_id: llama-guard - provider_shield_id: openai/gpt-4o-mini + provider_shield_id: vllm/meta-llama/Llama-3.2-1B-Instruct datasets: [] scoring_fns: [] benchmarks: [] diff --git a/tests/e2e-prow/rhoai/manifests/gpu/cluster-policy.yaml b/tests/e2e-prow/rhoai/manifests/gpu/cluster-policy.yaml new file mode 100644 index 000000000..673a62ed4 --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/gpu/cluster-policy.yaml @@ -0,0 +1,31 @@ +apiVersion: nvidia.com/v1 +kind: ClusterPolicy +metadata: + name: gpu-cluster-policy +spec: + daemonsets: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - key: gpu +
operator: Exists + effect: NoSchedule + operator: + defaultRuntime: crio + driver: + enabled: true + toolkit: + enabled: true + devicePlugin: + enabled: true + dcgm: + enabled: true + dcgmExporter: + enabled: true + gfd: + enabled: true + migManager: + enabled: false + nodeStatusExporter: + enabled: true \ No newline at end of file diff --git a/tests/e2e-prow/rhoai/manifests/gpu/create-nfd.yaml b/tests/e2e-prow/rhoai/manifests/gpu/create-nfd.yaml new file mode 100644 index 000000000..568a61e3c --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/gpu/create-nfd.yaml @@ -0,0 +1,8 @@ +# Minimal NFD instance +apiVersion: nfd.openshift.io/v1 +kind: NodeFeatureDiscovery +metadata: + name: nfd-instance + namespace: openshift-nfd +spec: + instance: "" diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml index b228ab650..005f96978 100644 --- a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml @@ -4,6 +4,8 @@ metadata: name: llama-stack-service namespace: e2e-rhoai-dsc spec: + imagePullSecrets: + - name: quay-lightspeed-pull-secret containers: - name: llama-stack-container env: @@ -17,7 +19,7 @@ spec: secretKeyRef: name: vllm-api-key-secret key: key - image: quay.io/opendatahub/llama-stack:rhoai-v2.25-latest + image: ${LLAMA_STACK_IMAGE} ports: - containerPort: 8321 volumeMounts: diff --git a/tests/e2e-prow/rhoai/manifests/namespaces/nfd.yaml b/tests/e2e-prow/rhoai/manifests/namespaces/nfd.yaml new file mode 100644 index 000000000..b15d50c6f --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/namespaces/nfd.yaml @@ -0,0 +1,5 @@ +# NFD Namespace +apiVersion: v1 +kind: Namespace +metadata: + name: openshift-nfd \ No newline at end of file diff --git a/tests/e2e-prow/rhoai/manifests/namespaces/nvidia-operator.yaml b/tests/e2e-prow/rhoai/manifests/namespaces/nvidia-operator.yaml new file mode 100644 index 000000000..81288f3a4 --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/namespaces/nvidia-operator.yaml @@ -0,0 +1,5 @@ +# NVIDIA GPU Operator Namespace +apiVersion: v1 +kind: Namespace +metadata: + name: nvidia-gpu-operator diff --git a/tests/e2e-prow/rhoai/manifests/operators/operatorgroup.yaml b/tests/e2e-prow/rhoai/manifests/operators/operatorgroup.yaml index 7ed06cac5..dd6b119c0 100644 --- a/tests/e2e-prow/rhoai/manifests/operators/operatorgroup.yaml +++ b/tests/e2e-prow/rhoai/manifests/operators/operatorgroup.yaml @@ -3,4 +3,24 @@ kind: OperatorGroup metadata: name: global-operators namespace: openshift-operators -spec: \ No newline at end of file +spec: +--- +# NVIDIA GPU Operator OperatorGroup +apiVersion: operators.coreos.com/v1 +kind: OperatorGroup +metadata: + name: nvidia-gpu-operator-group + namespace: nvidia-gpu-operator +spec: + targetNamespaces: + - nvidia-gpu-operator +--- +# Create OperatorGroup +apiVersion: operators.coreos.com/v1 +kind: OperatorGroup +metadata: + name: nfd-operator-group + namespace: openshift-nfd +spec: + targetNamespaces: + - openshift-nfd \ No newline at end of file diff --git a/tests/e2e-prow/rhoai/manifests/operators/operators.yaml b/tests/e2e-prow/rhoai/manifests/operators/operators.yaml index 2da92a08e..9eb113b82 100644 --- a/tests/e2e-prow/rhoai/manifests/operators/operators.yaml +++ b/tests/e2e-prow/rhoai/manifests/operators/operators.yaml @@ -32,4 +32,29 @@ spec: channel: stable name: rhods-operator source: redhat-operators - sourceNamespace: openshift-marketplace \ No newline at end of file + 
sourceNamespace: openshift-marketplace +--- +# NVIDIA GPU Operator Subscription +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + name: gpu-operator-certified + namespace: nvidia-gpu-operator +spec: + channel: stable + installPlanApproval: Automatic + name: gpu-operator-certified + source: certified-operators + sourceNamespace: openshift-marketplace +--- +# Node Feature Discovery (NFD) Operator Subscription +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + name: nfd + namespace: openshift-nfd +spec: + channel: stable + name: nfd + source: redhat-operators + sourceNamespace: openshift-marketplace diff --git a/tests/e2e-prow/rhoai/manifests/vllm/inference-service.yaml b/tests/e2e-prow/rhoai/manifests/vllm/vllm-inference-service-cpu.yaml similarity index 100% rename from tests/e2e-prow/rhoai/manifests/vllm/inference-service.yaml rename to tests/e2e-prow/rhoai/manifests/vllm/vllm-inference-service-cpu.yaml diff --git a/tests/e2e-prow/rhoai/manifests/vllm/vllm-inference-service-gpu.yaml b/tests/e2e-prow/rhoai/manifests/vllm/vllm-inference-service-gpu.yaml new file mode 100644 index 000000000..da2e3eb19 --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/vllm/vllm-inference-service-gpu.yaml @@ -0,0 +1,25 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: vllm-model + namespace: e2e-rhoai-dsc + annotations: + serving.kserve.io/deploymentMode: RawDeployment + sidecar.istio.io/inject: "false" +spec: + predictor: + minReplicas: 1 + maxReplicas: 1 + model: + modelFormat: + name: vLLM + runtime: vllm-gpu + resources: + limits: + nvidia.com/gpu: 1 + cpu: "4" + memory: 20Gi + requests: + nvidia.com/gpu: 1 + cpu: "2" + memory: 16Gi diff --git a/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml new file mode 100644 index 000000000..2027cfcf2 --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml @@ -0,0 +1,82 @@ +apiVersion: serving.kserve.io/v1alpha1 +kind: ServingRuntime +metadata: + annotations: + openshift.io/display-name: vLLM GPU + name: vllm-gpu + namespace: e2e-rhoai-dsc + labels: + opendatahub.io/dashboard: "true" +spec: + builtInAdapter: + modelLoadingTimeoutMillis: 90000 + containers: + - args: + - --model + - meta-llama/Llama-3.2-1B-Instruct + - --enable-auto-tool-choice + - --tool-call-parser + - llama3_json + - --chat-template + - /mnt/chat-template/tool_chat_template_llama3.2_json.jinja + - --download-dir + - /tmp/models-cache + - --port + - "8080" + - --max-model-len + - "2048" + - --gpu-memory-utilization + - "0.9" + image: ${VLLM_IMAGE} + name: kserve-container + env: + - name: HF_HUB_OFFLINE + value: "false" + - name: TRANSFORMERS_OFFLINE + value: "false" + - name: HF_DATASETS_OFFLINE + value: "false" + - name: HF_HOME + value: /mnt/models-cache/hf_home + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + - name: VLLM_API_KEY + valueFrom: + secretKeyRef: + name: vllm-api-key-secret + key: key + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + volumeMounts: + - name: chat-template + mountPath: /mnt/chat-template + - name: models-cache + mountPath: /mnt/models-cache + - name: vllm-cache + mountPath: /.cache + resources: + limits: + nvidia.com/gpu: 1 + cpu: "6" + memory: 20Gi + requests: + nvidia.com/gpu: 1 + cpu: "4" + memory: 16Gi + volumes: + - name: chat-template + configMap: + name: vllm-chat-template + - name: models-cache + emptyDir: {} + - name: 
vllm-cache + emptyDir: {} + multiModel: false + supportedModelFormats: + - autoSelect: true + name: vLLM diff --git a/tests/e2e-prow/rhoai/pipeline-services.sh b/tests/e2e-prow/rhoai/pipeline-services.sh index 832bff727..8d011bce7 100755 --- a/tests/e2e-prow/rhoai/pipeline-services.sh +++ b/tests/e2e-prow/rhoai/pipeline-services.sh @@ -2,10 +2,10 @@ BASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -oc apply -f "$BASE_DIR/manifests/lightspeed/llama-stack.yaml" +envsubst < "$BASE_DIR/manifests/lightspeed/llama-stack.yaml" | oc apply -f - oc wait pod/llama-stack-service \ --n e2e-rhoai-dsc --for=condition=Ready --timeout=300s +-n e2e-rhoai-dsc --for=condition=Ready --timeout=600s # Get url address of llama-stack pod oc label pod llama-stack-service pod=llama-stack-service -n e2e-rhoai-dsc diff --git a/tests/e2e-prow/rhoai/pipeline-vllm.sh b/tests/e2e-prow/rhoai/pipeline-vllm.sh index 20dedf752..47248442c 100755 --- a/tests/e2e-prow/rhoai/pipeline-vllm.sh +++ b/tests/e2e-prow/rhoai/pipeline-vllm.sh @@ -1,7 +1,11 @@ #!/bin/bash +set -euo pipefail + PIPELINE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" "$PIPELINE_DIR/scripts/bootstrap.sh" "$PIPELINE_DIR" +"$PIPELINE_DIR/scripts/gpu-setup.sh" "$PIPELINE_DIR" +source "$PIPELINE_DIR/scripts/fetch-vllm-image.sh" "$PIPELINE_DIR/scripts/deploy-vllm.sh" "$PIPELINE_DIR" "$PIPELINE_DIR/scripts/get-vllm-pod-info.sh" \ No newline at end of file diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index 71f5a4910..718dc36ae 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -8,7 +8,17 @@ trap 'echo "❌ Pipeline failed at line $LINENO"; exit 1' ERR #======================================== NAMESPACE="e2e-rhoai-dsc" MODEL_NAME="meta-llama/Llama-3.2-1B-Instruct" - +PIPELINE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Get llama-stack image from GitHub Containerfile +echo "Fetching llama-stack image from GitHub..." +LLAMA_STACK_IMAGE=$(curl -sL https://raw.githubusercontent.com/lightspeed-core/lightspeed-stack/main/test.containerfile | grep -m1 '^FROM' | awk '{print $2}') +if [ -z "$LLAMA_STACK_IMAGE" ]; then + echo "❌ Failed to fetch llama-stack image from GitHub" + exit 1 +fi +echo " -> Found llama-stack image: $LLAMA_STACK_IMAGE" +export LLAMA_STACK_IMAGE #======================================== # 2. 
ENVIRONMENT SETUP @@ -16,9 +26,14 @@ MODEL_NAME="meta-llama/Llama-3.2-1B-Instruct" echo "===== Setting up environment variables =====" export HUGGING_FACE_HUB_TOKEN=$(cat /var/run/huggingface/hf-token-ces-lcore-test || true) export VLLM_API_KEY=$(cat /var/run/vllm/vllm-api-key-lcore-test || true) +export QUAY_ROBOT_NAME=$(cat /var/run/quay-aipcc-name/lcore-quay-name-lcore-test || true) +export QUAY_ROBOT_PASSWORD=$(cat /var/run/quay-aipcc-password/lcore-quay-password-lcore-test || true) + [[ -n "$HUGGING_FACE_HUB_TOKEN" ]] && echo "✅ HUGGING_FACE_HUB_TOKEN is set" || { echo "❌ Missing HUGGING_FACE_HUB_TOKEN"; exit 1; } [[ -n "$VLLM_API_KEY" ]] && echo "✅ VLLM_API_KEY is set" || { echo "❌ Missing VLLM_API_KEY"; exit 1; } +[[ -n "$QUAY_ROBOT_NAME" ]] && echo "✅ QUAY_ROBOT_NAME is set" || { echo "❌ Missing QUAY_ROBOT_NAME"; exit 1; } +[[ -n "$QUAY_ROBOT_PASSWORD" ]] && echo "✅ QUAY_ROBOT_PASSWORD is set" || { echo "❌ Missing QUAY_ROBOT_PASSWORD"; exit 1; } # Basic info ls -A || true @@ -31,6 +46,11 @@ oc whoami echo "===== Creating namespace & secrets =====" oc get ns "$NAMESPACE" >/dev/null 2>&1 || oc create namespace "$NAMESPACE" +# Create NFD and NVIDIA namespaces +oc apply -f "$PIPELINE_DIR/manifests/namespaces/nfd.yaml" +oc apply -f "$PIPELINE_DIR/manifests/namespaces/nvidia-operator.yaml" + + create_secret() { local name=$1; shift echo "Creating secret $name..." @@ -40,6 +60,17 @@ create_secret() { create_secret hf-token-secret --from-literal=token="$HUGGING_FACE_HUB_TOKEN" create_secret vllm-api-key-secret --from-literal=key="$VLLM_API_KEY" +# Create Quay pull secret for llama-stack images +echo "Creating Quay pull secret..." +oc create secret docker-registry quay-lightspeed-pull-secret \ + --docker-server=quay.io \ + --docker-username="$QUAY_ROBOT_NAME" \ + --docker-password="$QUAY_ROBOT_PASSWORD" \ + -n "$NAMESPACE" 2>/dev/null && echo "✅ Quay pull secret created" || echo "⚠️ Secret exists or creation failed" + +# Link the secret to default service account for image pulls +oc secrets link default quay-lightspeed-pull-secret --for=pull -n "$NAMESPACE" 2>/dev/null || echo "⚠️ Secret already linked to default SA" + #======================================== # 4. CONFIGMAPS @@ -73,26 +104,33 @@ start_time=$(date +%s) timeout=200 while true; do - response=$(curl -sk -w "%{http_code}" \ - -H "Content-Type: application/json" \ + # Create a temporary pod for testing (if it doesn't exist) + if ! oc get pod vllm-test-curl -n "$NAMESPACE" &>/dev/null; then + oc run vllm-test-curl --image=curlimages/curl:latest \ + --restart=Never -n "$NAMESPACE" -- sleep 3600 + oc wait --for=condition=Ready pod/vllm-test-curl -n "$NAMESPACE" --timeout=60s + fi + + # Execute curl inside the pod and capture response + response=$(oc exec vllm-test-curl -n "$NAMESPACE" -- \ + curl -sk -w '\n%{http_code}' \ + -H 'Content-Type: application/json' \ -H "Authorization: Bearer $VLLM_API_KEY" \ -d "{ \"model\": \"$MODEL_NAME\", \"prompt\": \"Who won the world series in 2020?\", \"max_new_tokens\": 100 }" \ - "$KSVC_URL/v1/completions") + "$KSVC_URL/v1/completions" 2>&1 || echo -e "\n000") - if [[ ${#response} -ge 3 ]]; then - http_code="${response: -3}" - body="${response:0:${#response}-3}" - else - http_code="000" - body="$response" - fi + # Extract HTTP code from last line + http_code=$(echo "$response" | tail -1 | tr -d '[:space:]') + # Extract body from all lines except last + body=$(echo "$response" | sed '$d') if [[ "$http_code" == "200" && "$body" == *'"object":"text_completion"'* ]]; then echo "✅ API test passed." 
+ echo "$body" | jq . 2>/dev/null || echo "$body" break else echo "❌ API test failed (HTTP $http_code)" @@ -104,12 +142,16 @@ while true; do if (( elapsed >= timeout )); then echo "⏰ Timeout reached ($timeout seconds). Stopping test." + oc delete pod vllm-test-curl -n "$NAMESPACE" --ignore-not-found=true exit 1 fi sleep 20 done +# Cleanup test pod +oc delete pod vllm-test-curl -n "$NAMESPACE" --ignore-not-found=true + #======================================== # 7. DEPLOY LIGHTSPEED STACK AND LLAMA STACK @@ -123,8 +165,32 @@ oc create configmap test-script-cm -n "$NAMESPACE" --from-file=run-tests.sh ./pipeline-services.sh -oc wait pod/lightspeed-stack-service pod/llama-stack-service \ - -n "$NAMESPACE" --for=condition=Ready --timeout=300s +echo "--> Final wait for both lightspeed-stack-service and llama-stack-service pods..." +if ! oc wait pod/lightspeed-stack-service pod/llama-stack-service \ + -n "$NAMESPACE" --for=condition=Ready --timeout=600s; then + echo "" + echo "❌ One or both service pods failed to become ready within timeout" + echo "" + echo "DEBUG: Pod status:" + oc get pods -n "$NAMESPACE" -o wide || true + echo "" + echo "DEBUG: lightspeed-stack-service description:" + oc describe pod lightspeed-stack-service -n "$NAMESPACE" || true + echo "" + echo "DEBUG: llama-stack-service description:" + oc describe pod llama-stack-service -n "$NAMESPACE" || true + echo "" + echo "DEBUG: lightspeed-stack-service logs:" + oc logs lightspeed-stack-service -n "$NAMESPACE" --tail=100 || true + echo "" + echo "DEBUG: llama-stack-service logs:" + oc logs llama-stack-service -n "$NAMESPACE" --tail=100 || true + echo "" + echo "DEBUG: Recent events in namespace:" + oc get events -n "$NAMESPACE" --sort-by='.lastTimestamp' | tail -20 || true + exit 1 +fi +echo "✅ Both service pods are ready" sleep 30 oc get pods -n "$NAMESPACE" diff --git a/tests/e2e-prow/rhoai/scripts/bootstrap.sh b/tests/e2e-prow/rhoai/scripts/bootstrap.sh index 7a40ca56e..1718b70e5 100755 --- a/tests/e2e-prow/rhoai/scripts/bootstrap.sh +++ b/tests/e2e-prow/rhoai/scripts/bootstrap.sh @@ -20,9 +20,29 @@ wait_for_operator() { } # APPLY OPERATOR SUBSCRIPTIONS +echo "--> Applying OperatorGroups from operatorgroup.yaml..." +oc apply -f "$BASE_DIR/manifests/operators/operatorgroup.yaml" + +sleep 10 + echo "--> Applying Operator Subscriptions from operators.yaml..." oc apply -f "$BASE_DIR/manifests/operators/operators.yaml" +sleep 10 + +# WAIT FOR GPU OPERATOR NAMESPACE AND OPERATORGROUP +echo "--> Ensuring GPU Operator namespace and OperatorGroup are ready..." +oc wait --for=jsonpath='{.status.phase}'=Active namespace/nvidia-gpu-operator --timeout=60s +echo " -> Waiting for GPU OperatorGroup to be created..." +until oc get operatorgroup nvidia-gpu-operator-group -n nvidia-gpu-operator &>/dev/null; do + echo " ...still waiting for OperatorGroup" + sleep 2 +done +echo " -> GPU OperatorGroup ready" + +# Give OLM a moment to process the OperatorGroup before checking subscriptions +sleep 5 + # WAIT FOR OPERATORS TO BECOME READY echo "--> Waiting for Operators to be installed. This can take several minutes..." 
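Note on the wait_for_operator helper invoked in the next hunk: its body is defined near the top of bootstrap.sh and is not shown by this diff. A minimal sketch of an equivalent OLM readiness check, assuming the helper ultimately watches the Subscription's installed CSV until it reports Succeeded (the function name, arguments, and timeout below are illustrative, not the project's actual implementation):

# Illustrative only: poll OLM until the CSV installed by a Subscription reaches phase Succeeded.
wait_for_csv() {
  local subscription="$1" namespace="$2" display_name="$3"
  echo " -> Waiting for $display_name..."
  local csv=""
  until [ -n "$csv" ]; do
    csv=$(oc get subscription "$subscription" -n "$namespace" -o jsonpath='{.status.installedCSV}' 2>/dev/null)
    [ -z "$csv" ] && sleep 5
  done
  oc wait --for=jsonpath='{.status.phase}'=Succeeded "csv/$csv" -n "$namespace" --timeout=600s
}
# Hypothetical usage: wait_for_csv gpu-operator-certified nvidia-gpu-operator "GPU Operator"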
@@ -33,13 +53,42 @@ wait_for_operator "operators.coreos.com/servicemeshoperator.openshift-operators" wait_for_operator "operators.coreos.com/serverless-operator.openshift-operators" "openshift-operators" "Serverless Operator" wait_for_operator "operators.coreos.com/rhods-operator.openshift-operators" "openshift-operators" "RHODS Operator" +# Verify GPU operator InstallPlan was created before waiting for CSV +echo " -> Verifying GPU Operator InstallPlan was created..." +timeout=120 +elapsed=0 +until oc get installplan -n nvidia-gpu-operator --no-headers 2>/dev/null | grep -q .; do + if [ $elapsed -ge $timeout ]; then + echo " ❌ No InstallPlan created for GPU Operator - this is an OLM issue" + echo " Attempting to fix by recreating subscription..." + oc delete subscription gpu-operator-certified -n nvidia-gpu-operator + sleep 5 + oc apply -f "$BASE_DIR/manifests/operators/operators.yaml" + sleep 10 + # Try one more time + if ! oc get installplan -n nvidia-gpu-operator --no-headers 2>/dev/null | grep -q .; then + echo " ❌ Still no InstallPlan - manual intervention required" + exit 1 + fi + break + fi + echo " ...waiting for InstallPlan ($elapsed/$timeout seconds)" + sleep 5 + elapsed=$((elapsed + 5)) +done +echo " -> InstallPlan created successfully" + +wait_for_operator "operators.coreos.com/gpu-operator-certified.nvidia-gpu-operator" "nvidia-gpu-operator" "GPU Operator" +wait_for_operator "operators.coreos.com/nfd.openshift-nfd" "openshift-nfd" "NFD Operator" + +echo " -> Waiting for NFD CRD to be established..." +oc wait --for=condition=established --timeout=300s crd/nodefeaturediscoveries.nfd.openshift.io + echo "--> All operators are ready." oc get csv -n openshift-operators - -# APPLY DEPENDENT RESOURCES -echo "--> Applying OperatorGroup from operatorgroup.yaml..." -oc apply -f "$BASE_DIR/manifests/operators/operatorgroup.yaml" +oc get csv -n nvidia-gpu-operator +oc get csv -n openshift-nfd echo "--> Applying DataScienceCluster from ds-cluster.yaml..." oc apply -f "$BASE_DIR/manifests/operators/ds-cluster.yaml" diff --git a/tests/e2e-prow/rhoai/scripts/deploy-vllm.sh b/tests/e2e-prow/rhoai/scripts/deploy-vllm.sh index 4d31e663c..5c3201fa5 100755 --- a/tests/e2e-prow/rhoai/scripts/deploy-vllm.sh +++ b/tests/e2e-prow/rhoai/scripts/deploy-vllm.sh @@ -29,6 +29,50 @@ until oc get endpoints kserve-webhook-server-service -n redhat-ods-applications done echo "✅ KServe webhook service is ready." -oc apply -f "$BASE_DIR/manifests/vllm/vllm-runtime-cpu.yaml" +# Wait for GPU nodes to be labeled by NFD +echo "Waiting for GPU nodes to be labeled by NFD..." +timeout=600 # 10 minutes +elapsed=0 +until oc get nodes -l nvidia.com/gpu.present=true --no-headers 2>/dev/null | grep -q .; do + if [ $elapsed -ge $timeout ]; then + echo "❌ Timeout waiting for GPU nodes to be labeled" + exit 1 + fi + echo "No GPU nodes found yet. Waiting... ($elapsed/$timeout seconds)" + sleep 10 + elapsed=$((elapsed + 10)) +done +echo "✅ GPU nodes detected." + +# Wait for GPU capacity to be available +echo "Waiting for GPU capacity to be available on nodes..." +timeout=600 # 10 minutes +elapsed=0 +until [ "$(oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].status.capacity.nvidia\.com/gpu}' 2>/dev/null)" != "" ] && \ + [ "$(oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].status.capacity.nvidia\.com/gpu}' 2>/dev/null)" != "0" ]; do + if [ $elapsed -ge $timeout ]; then + echo "❌ Timeout waiting for GPU capacity" + echo "DEBUG: Checking GPU status..." 
+ oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{"capacity: "}{.status.capacity.nvidia\.com/gpu}{"\t"}{"allocatable: "}{.status.allocatable.nvidia\.com/gpu}{"\n"}{end}' + exit 1 + fi + capacity=$(oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].status.capacity.nvidia\.com/gpu}' 2>/dev/null || echo "0") + echo "GPU capacity: $capacity. Waiting... ($elapsed/$timeout seconds)" + sleep 10 + elapsed=$((elapsed + 10)) +done +echo "✅ GPU capacity available." + +# Display GPU node info +echo "GPU nodes ready:" +oc get nodes -l nvidia.com/gpu.present=true -o custom-columns=NAME:.metadata.name,GPU:.status.capacity.nvidia\\.com/gpu,INSTANCE:.metadata.labels.node\\.kubernetes\\.io/instance-type + +echo "Applying vLLM manifests..." + +envsubst < "$BASE_DIR/manifests/vllm/vllm-runtime-gpu.yaml" | oc apply -f - + +# Wait a moment for the ServingRuntime to be fully persisted before creating the InferenceService +echo "Waiting for ServingRuntime to be ready..." +sleep 5 -oc apply -f "$BASE_DIR/manifests/vllm/inference-service.yaml" \ No newline at end of file +oc apply -f "$BASE_DIR/manifests/vllm/vllm-inference-service-gpu.yaml" \ No newline at end of file diff --git a/tests/e2e-prow/rhoai/scripts/fetch-vllm-image.sh b/tests/e2e-prow/rhoai/scripts/fetch-vllm-image.sh new file mode 100755 index 000000000..2d1be03a7 --- /dev/null +++ b/tests/e2e-prow/rhoai/scripts/fetch-vllm-image.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Get vLLM CUDA image from RHOAI ServingRuntime template +echo "Fetching vLLM CUDA image from RHOAI..." +# Wait for RHOAI templates to be available (up to 20 minutes for first check) +timeout=1200 +elapsed=0 +until oc get template vllm-cuda-runtime-template -n redhat-ods-applications &>/dev/null; do + if [ $elapsed -ge $timeout ]; then + echo "❌ Timeout waiting for RHOAI templates (waited $timeout seconds)" + exit 1 + fi + echo " -> Waiting for RHOAI templates... ($elapsed/$timeout seconds)" + sleep 10 + elapsed=$((elapsed + 10)) +done + +# Extract vLLM image from the template +VLLM_IMAGE=$(oc get template vllm-cuda-runtime-template -n redhat-ods-applications -o jsonpath='{.objects[0].spec.containers[0].image}' 2>/dev/null || echo "") + +# Fallback: check existing ServingRuntimes for vLLM image +if [ -z "$VLLM_IMAGE" ]; then + echo " -> Template not found, checking existing ServingRuntimes..." + # Get all serving runtimes and filter for vLLM ones + VLLM_IMAGE=$(oc get servingruntime -A -o jsonpath='{range .items[*]}{.metadata.name}{","}{.spec.containers[0].image}{"\n"}{end}' 2>/dev/null | grep -i vllm | cut -d',' -f2 | grep 'odh-vllm-cuda-rhel9' | head -1 || echo "") +fi + +# Fallback: use default if still not found +if [ -z "$VLLM_IMAGE" ]; then + echo " -> Could not find vLLM image dynamically, using fallback..." + VLLM_IMAGE="registry.redhat.io/rhoai/odh-vllm-cuda-rhel9@sha256:5b86924790aeb996a7e3b7f9f4c8a3a676a83cd1d7484ae584101722d362c69b" +fi +echo " -> Found vLLM image: $VLLM_IMAGE" + +# Export images as environment variables for manifest substitution +export VLLM_IMAGE diff --git a/tests/e2e-prow/rhoai/scripts/get-vllm-pod-info.sh b/tests/e2e-prow/rhoai/scripts/get-vllm-pod-info.sh index ac693a47f..53ab1a03f 100755 --- a/tests/e2e-prow/rhoai/scripts/get-vllm-pod-info.sh +++ b/tests/e2e-prow/rhoai/scripts/get-vllm-pod-info.sh @@ -1,5 +1,4 @@ #!/bin/bash -set -e NAMESPACE="e2e-rhoai-dsc" ISVC_NAME="${1:-vllm-model}" @@ -11,34 +10,78 @@ echo "--> Finding the pod for InferenceService '$ISVC_NAME'..." 
# Find the running pod for the InferenceService POD_NAME="" -TIMEOUT=240 # seconds -INTERVAL=5 # check interval +CURRENT_POD="" +CURRENT_STATUS="" +TIMEOUT=580 +INTERVAL=20 ELAPSED=0 until [ -n "$POD_NAME" ] || [ $ELAPSED -ge $TIMEOUT ]; do + # Get the pod name regardless of status for visibility + CURRENT_POD=$(oc get pods -n "$NAMESPACE" \ + -l "serving.kserve.io/inferenceservice=$ISVC_NAME" \ + -o jsonpath="{.items[0].metadata.name}" 2>/dev/null) + + # Get the pod status + CURRENT_STATUS=$(oc get pods -n "$NAMESPACE" \ + -l "serving.kserve.io/inferenceservice=$ISVC_NAME" \ + -o jsonpath="{.items[0].status.phase}" 2>/dev/null) + + # Check if a running pod exists POD_NAME=$(oc get pods -n "$NAMESPACE" \ -l "serving.kserve.io/inferenceservice=$ISVC_NAME" \ -o jsonpath="{.items[?(@.status.phase=='Running')].metadata.name}" 2>/dev/null) - echo "Waiting for pod $POD_NAME in namespace $NAMESPACE" + + if [ -n "$CURRENT_POD" ]; then + echo "Waiting for pod $CURRENT_POD in namespace $NAMESPACE (current status: ${CURRENT_STATUS:-Unknown})" + # Show more debug info if pod exists but isn't Running + if [ -z "$POD_NAME" ] && [ $((ELAPSED % 60)) -eq 0 ]; then + echo " DEBUG: Pod details:" + oc get pod "$CURRENT_POD" -n "$NAMESPACE" -o wide || true + echo " DEBUG: Pod events:" + oc get events -n "$NAMESPACE" --field-selector involvedObject.name="$CURRENT_POD" --sort-by='.lastTimestamp' | tail -5 || true + fi + else + echo "Waiting for pod with label serving.kserve.io/inferenceservice=$ISVC_NAME in namespace $NAMESPACE (no pod found yet)" + # Show InferenceService status if no pod found + if [ $((ELAPSED % 60)) -eq 0 ]; then + echo " DEBUG: InferenceService status:" + oc get inferenceservice "$ISVC_NAME" -n "$NAMESPACE" -o jsonpath='{.status.conditions}' || true + echo "" + echo " DEBUG: All pods in namespace:" + oc get pods -n "$NAMESPACE" || true + fi + fi if [ -z "$POD_NAME" ]; then - echo " -> Pod not running yet, waiting $INTERVAL seconds..." + echo " -> Pod not running yet, waiting $INTERVAL seconds... ($ELAPSED/$TIMEOUT)" sleep $INTERVAL ELAPSED=$((ELAPSED + INTERVAL)) fi done -oc describe pod $POD_NAME -n $NAMESPACE || true -oc logs $POD_NAME -n $NAMESPACE || true - -POD_NAME=$(oc get pods -n $NAMESPACE -o jsonpath='{.items[0].metadata.name}') - +# Exit immediately if no running pod was found if [ -z "$POD_NAME" ]; then - echo " -> Timeout reached after $TIMEOUT seconds. Pod is not running." -else - echo " -> Pod is running: $POD_NAME" + echo "" + echo "❌ Timeout reached after $TIMEOUT seconds. Pod is not running." 
+ echo "" + echo "DEBUG: InferenceService status:" + oc describe inferenceservice "$ISVC_NAME" -n "$NAMESPACE" || true + echo "" + echo "DEBUG: All pods in namespace:" + oc get pods -n "$NAMESPACE" -o wide || true + echo "" + echo "DEBUG: Recent events:" + oc get events -n "$NAMESPACE" --sort-by='.lastTimestamp' | tail -20 || true + exit 1 fi +echo " -> Pod is running: $POD_NAME" + +# Show pod details +oc describe pod "$POD_NAME" -n "$NAMESPACE" || true +oc logs "$POD_NAME" -n "$NAMESPACE" --tail=50 || true + # Get the 'app' label for Service selector APP_LABEL=$(oc get pod "$POD_NAME" -n "$NAMESPACE" -o jsonpath='{.metadata.labels.app}') if [ -z "$APP_LABEL" ]; then @@ -47,13 +90,34 @@ if [ -z "$APP_LABEL" ]; then fi echo " -> Found 'app' label: $APP_LABEL" -# Get the Knative Service URL -KSVC_URL=$(oc get ksvc "$KSVC_NAME" -n "$NAMESPACE" -o jsonpath='{.status.url}') -if [ -z "$KSVC_URL" ]; then - echo "Error: Could not retrieve Knative URL for $KSVC_NAME" - exit 1 +# Check if this is RawDeployment mode (standard K8s Service) or Serverless (Knative Service) +if oc get ksvc "$KSVC_NAME" -n "$NAMESPACE" &>/dev/null; then + # Serverless mode - get Knative Service URL + KSVC_URL=$(oc get ksvc "$KSVC_NAME" -n "$NAMESPACE" -o jsonpath='{.status.url}') + echo " -> Found Knative Service URL: $KSVC_URL" +else + # RawDeployment mode - construct URL from standard K8s Service + echo " -> RawDeployment mode detected, looking for standard Kubernetes Service..." + SERVICE_NAME="${ISVC_NAME}-predictor" + + # Check if the service exists + if ! oc get service "$SERVICE_NAME" -n "$NAMESPACE" &>/dev/null; then + echo "Error: Could not find Service $SERVICE_NAME" + exit 1 + fi + + # Get the cluster IP and targetPort (the actual container port) + CLUSTER_IP=$(oc get service "$SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.clusterIP}') + SERVICE_PORT=$(oc get service "$SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.ports[0].port}') + TARGET_PORT=$(oc get service "$SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.ports[0].targetPort}') + + # Use targetPort (container port) instead of service port for RawDeployment + PORT=${TARGET_PORT:-$SERVICE_PORT} + + # Construct internal cluster URL + KSVC_URL="http://${SERVICE_NAME}.${NAMESPACE}.svc.cluster.local:${PORT}" + echo " -> Found Service URL: $KSVC_URL (Cluster IP: $CLUSTER_IP, Service Port: $SERVICE_PORT, Target Port: $TARGET_PORT)" fi -echo " -> Found Knative URL: $KSVC_URL" # Save all info to pod.env cat <<EOF > "$ENV_FILE" diff --git a/tests/e2e-prow/rhoai/scripts/gpu-setup.sh b/tests/e2e-prow/rhoai/scripts/gpu-setup.sh new file mode 100755 index 000000000..d72d744bb --- /dev/null +++ b/tests/e2e-prow/rhoai/scripts/gpu-setup.sh @@ -0,0 +1,208 @@ +#!/bin/bash + +set -euo pipefail + +BASE_DIR="$1" + +echo "Setting up GPU support..." + +# Debug: Show all nodes and their instance types +echo "" +echo "--> DEBUG: Cluster nodes before GPU setup..." +oc get nodes -o custom-columns=NAME:.metadata.name,INSTANCE:.metadata.labels.node\\.kubernetes\\.io/instance-type,STATUS:.status.conditions[-1].type + +# Debug: Check for GPU instance types and taints +echo "" +echo "--> DEBUG: Checking for GPU nodes and taints..."
+gpu_nodes=$(oc get nodes -o jsonpath='{range .items[*]}{.metadata.name}{","}{.metadata.labels.node\.kubernetes\.io/instance-type}{"\n"}{end}' | grep -E "g4dn|p3|p4|g5" | cut -d',' -f1 || echo "") + +if [ -n "$gpu_nodes" ]; then + echo " Found GPU instance types:" + for node in $gpu_nodes; do + echo " Node: $node" + echo " Instance Type: $(oc get node $node -o jsonpath='{.metadata.labels.node\.kubernetes\.io/instance-type}')" + echo " Taints:" + oc get node $node -o jsonpath='{.spec.taints}' || echo " No taints" + echo "" + done +else + echo " No GPU instance types found (g4dn, p3, p4, g5)" + echo " All node instance types:" + oc get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.labels.node\.kubernetes\.io/instance-type}{"\n"}{end}' +fi + +# Apply NFD instance +echo "" +echo "--> Applying NFD instance..." +oc apply -f "$BASE_DIR/manifests/gpu/create-nfd.yaml" + +# Wait for NFD worker daemonset to be created +echo "--> Waiting for NFD worker daemonset to be created..." +timeout=60 +elapsed=0 +until oc get daemonset nfd-worker -n openshift-nfd &>/dev/null; do + if [ $elapsed -ge $timeout ]; then + echo "❌ Timeout waiting for NFD worker daemonset" + exit 1 + fi + echo " Waiting for nfd-worker daemonset... ($elapsed/$timeout seconds)" + sleep 5 + elapsed=$((elapsed + 5)) +done +echo "✅ NFD worker daemonset created" + +# Patch NFD worker daemonset to add GPU node tolerations +# This is needed in the prow env to be assigned a GPU +echo "--> Patching NFD worker daemonset with GPU tolerations..." +oc patch daemonset nfd-worker -n openshift-nfd --type=json -p='[ + { + "op": "add", + "path": "/spec/template/spec/tolerations", + "value": [ + { + "key": "nvidia.com/gpu", + "operator": "Exists", + "effect": "NoSchedule" + }, + { + "key": "gpu", + "operator": "Exists", + "effect": "NoSchedule" + } + ] + } +]' +echo "✅ NFD worker tolerations added" + +# Apply ClusterPolicy +echo "" +echo "--> Applying ClusterPolicy..." +oc apply -f "$BASE_DIR/manifests/gpu/cluster-policy.yaml" + +# Wait for GPU operator pods to be created and healthy +echo "" +echo "--> Waiting for GPU operator pods to be healthy..." +echo " This may take up to 10 minutes while images are pulled and pods start..." +timeout=1200 +elapsed=0 +until oc get pods -n nvidia-gpu-operator --no-headers 2>/dev/null | awk '{if ($3 != "Running" && $3 != "Completed") exit 1}' && [ $(oc get pods -n nvidia-gpu-operator --no-headers 2>/dev/null | wc -l) -gt 5 ]; do + if [ $elapsed -ge $timeout ]; then + echo "❌ Timeout waiting for GPU operator pods to be healthy" + echo "Current pod status:" + oc get pods -n nvidia-gpu-operator + echo "" + echo "DEBUG: Checking for scheduling issues..." + oc get pods -n nvidia-gpu-operator -o wide + echo "" + echo "DEBUG: Checking pod events for failures..." + oc get events -n nvidia-gpu-operator --sort-by='.lastTimestamp' | tail -20 + exit 1 + fi + pod_count=$(oc get pods -n nvidia-gpu-operator --no-headers 2>/dev/null | wc -l || echo 0) + failed_pods=$(oc get pods -n nvidia-gpu-operator --no-headers 2>/dev/null | awk '{if ($3 != "Running" && $3 != "Completed") print $1}' | wc -l || echo 0) + echo " Pods: $pod_count total, $failed_pods not ready. Waiting... 
($elapsed/$timeout seconds)" + + # Show additional debug info every 60 seconds + if [ $((elapsed % 60)) -eq 0 ] && [ $elapsed -gt 0 ]; then + echo " DEBUG: Current pod statuses:" + oc get pods -n nvidia-gpu-operator -o wide + fi + + sleep 15 + elapsed=$((elapsed + 15)) +done +echo "✅ All GPU operator pods are healthy" + +# Debug: Show what pods are running +echo "" +echo "--> DEBUG: GPU operator pods deployed:" +oc get pods -n nvidia-gpu-operator -o wide + +# Wait for GPU nodes to be labeled by NFD +echo "" +echo "--> Waiting for GPU nodes to be labeled by NFD..." +timeout=120 +elapsed=0 +until oc get nodes -l nvidia.com/gpu.present=true --no-headers 2>/dev/null | grep -q .; do + if [ $elapsed -ge $timeout ]; then + echo "❌ Timeout waiting for GPU nodes to be labeled" + echo "" + echo "DEBUG: Checking why nodes aren't labeled..." + echo "All node labels related to features:" + oc get nodes --show-labels | grep -E "feature|gpu|nvidia" || echo "No GPU/feature labels found on any nodes" + echo "" + echo "DEBUG: NFD worker pods status:" + oc get pods -n openshift-nfd -o wide + echo "" + echo "DEBUG: Recent NFD events:" + oc get events -n openshift-nfd --sort-by='.lastTimestamp' | tail -10 + exit 1 + fi + echo " No GPU nodes found yet. Waiting... ($elapsed/$timeout seconds)" + + # Show debug info every 30 seconds + if [ $((elapsed % 30)) -eq 0 ] && [ $elapsed -gt 0 ]; then + echo " DEBUG: Checking NFD worker pods..." + oc get pods -n openshift-nfd --no-headers + fi + + sleep 10 + elapsed=$((elapsed + 10)) +done +echo "✅ GPU nodes detected" + +# Debug: Show labeled nodes +echo "" +echo "--> DEBUG: GPU nodes labeled:" +oc get nodes -l nvidia.com/gpu.present=true -o custom-columns=NAME:.metadata.name,INSTANCE:.metadata.labels.node\\.kubernetes\\.io/instance-type,TAINTS:.spec.taints + +# Wait for GPU capacity AND allocatable to become available +echo "--> Waiting for GPU capacity and allocatable to be available on nodes..." +timeout=120 +elapsed=0 +until [ "$(oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].status.capacity.nvidia\.com/gpu}' 2>/dev/null)" != "" ] && \ + [ "$(oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].status.capacity.nvidia\.com/gpu}' 2>/dev/null)" != "0" ]; do + if [ $elapsed -ge $timeout ]; then + echo "❌ Timeout waiting for GPU capacity/allocatable" + echo "" + echo "DEBUG: Investigating why GPU capacity is not appearing..." + echo "Device plugin pods:" + oc get pods -n nvidia-gpu-operator -l app=nvidia-device-plugin-daemonset -o wide + echo "" + echo "Device plugin pod logs (last 20 lines):" + device_plugin_pod=$(oc get pods -n nvidia-gpu-operator -l app=nvidia-device-plugin-daemonset -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + if [ -n "$device_plugin_pod" ]; then + oc logs -n nvidia-gpu-operator "$device_plugin_pod" --tail=20 || echo "Could not fetch logs" + else + echo "No device plugin pod found - checking for scheduling issues..." 
+ oc get events -n nvidia-gpu-operator --field-selector involvedObject.name=nvidia-device-plugin-daemonset --sort-by='.lastTimestamp' | tail -10 + fi + echo "" + echo "Node GPU capacity details:" + oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{"capacity: "}{.status.capacity.nvidia\.com/gpu}{"\t"}{"allocatable: "}{.status.allocatable.nvidia\.com/gpu}{"\n"}{end}' + exit 1 + fi + capacity=$(oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].status.capacity.nvidia\.com/gpu}' 2>/dev/null || echo "0") + allocatable=$(oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].status.allocatable.nvidia\.com/gpu}' 2>/dev/null || echo "0") + echo " GPU capacity: $capacity, allocatable: $allocatable. Waiting for both > 0... ($elapsed/$timeout seconds)" + + # Show debug info every 30 seconds + if [ $((elapsed % 30)) -eq 0 ] && [ $elapsed -gt 0 ]; then + echo " DEBUG: Checking device plugin daemonset pods..." + oc get pods -n nvidia-gpu-operator -l app=nvidia-device-plugin-daemonset --no-headers + fi + + sleep 15 + elapsed=$((elapsed + 15)) +done + +echo "" +echo "✅ GPU setup complete!" +echo "" +echo "GPU Node Status:" +oc get nodes -l nvidia.com/gpu.present=true -o custom-columns=NAME:.metadata.name,GPU:.status.capacity.nvidia\\.com/gpu,ALLOCATABLE:.status.allocatable.nvidia\\.com/gpu,INSTANCE:.metadata.labels.node\\.kubernetes\\.io/instance-type + +echo "" +echo "ClusterPolicy Status:" +oc get clusterpolicy gpu-cluster-policy -o jsonpath='{.status.state}' +echo ""
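A possible follow-up check, not part of this patch: once the ClusterPolicy reports its state, a throwaway pod that requests one GPU and runs nvidia-smi verifies end-to-end scheduling with the same tolerations configured above. The CUDA image tag is only an example of a public base image and may need to be adjusted:

# Optional GPU smoke test (sketch): request one GPU, run nvidia-smi once, then clean up.
cat <<'EOF' | oc apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test
  namespace: nvidia-gpu-operator
spec:
  restartPolicy: Never
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
    - key: gpu
      operator: Exists
      effect: NoSchedule
  containers:
    - name: cuda
      image: nvcr.io/nvidia/cuda:12.4.1-base-ubi9
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1
EOF
oc wait --for=jsonpath='{.status.phase}'=Succeeded pod/gpu-smoke-test -n nvidia-gpu-operator --timeout=300s
oc logs gpu-smoke-test -n nvidia-gpu-operator
oc delete pod gpu-smoke-test -n nvidia-gpu-operator --ignore-not-found=true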