From f642f8a7a044b8c2b28057b5836791aaf2ff197b Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Tue, 13 Jan 2026 09:34:20 +0100 Subject: [PATCH] Add GPU support for RHOAI e2e tests in Prow - Enable GPU for vLLM model serving in the Prow e2e test environment. - Add GPU node tolerations to ensure pods are scheduled on GPU-enabled nodes. - Resolve the llama-stack image dynamically from the LCS repository Containerfile. - Use the latest vLLM image from the Red Hat template. --- tests/e2e-prow/rhoai/configs/run.yaml | 10 +- .../rhoai/manifests/gpu/cluster-policy.yaml | 31 +++ .../rhoai/manifests/gpu/create-nfd.yaml | 8 + .../manifests/lightspeed/llama-stack.yaml | 4 +- .../rhoai/manifests/namespaces/nfd.yaml | 5 + .../manifests/namespaces/nvidia-operator.yaml | 5 + .../manifests/operators/operatorgroup.yaml | 22 +- .../rhoai/manifests/operators/operators.yaml | 27 ++- ...e.yaml => vllm-inference-service-cpu.yaml} | 0 .../vllm/vllm-inference-service-gpu.yaml | 25 +++ .../manifests/vllm/vllm-runtime-gpu.yaml | 82 +++++++ tests/e2e-prow/rhoai/pipeline-services.sh | 4 +- tests/e2e-prow/rhoai/pipeline-vllm.sh | 4 + tests/e2e-prow/rhoai/pipeline.sh | 92 ++++++-- tests/e2e-prow/rhoai/scripts/bootstrap.sh | 57 ++++- tests/e2e-prow/rhoai/scripts/deploy-vllm.sh | 48 +++- .../rhoai/scripts/fetch-vllm-image.sh | 36 +++ .../rhoai/scripts/get-vllm-pod-info.sh | 102 +++++++-- tests/e2e-prow/rhoai/scripts/gpu-setup.sh | 208 ++++++++++++++++++ 19 files changed, 722 insertions(+), 48 deletions(-) create mode 100644 tests/e2e-prow/rhoai/manifests/gpu/cluster-policy.yaml create mode 100644 tests/e2e-prow/rhoai/manifests/gpu/create-nfd.yaml create mode 100644 tests/e2e-prow/rhoai/manifests/namespaces/nfd.yaml create mode 100644 tests/e2e-prow/rhoai/manifests/namespaces/nvidia-operator.yaml rename tests/e2e-prow/rhoai/manifests/vllm/{inference-service.yaml => vllm-inference-service-cpu.yaml} (100%) create mode 100644 tests/e2e-prow/rhoai/manifests/vllm/vllm-inference-service-gpu.yaml create mode 100644 tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml create mode 100755 tests/e2e-prow/rhoai/scripts/fetch-vllm-image.sh create mode 100755 tests/e2e-prow/rhoai/scripts/gpu-setup.sh diff --git a/tests/e2e-prow/rhoai/configs/run.yaml b/tests/e2e-prow/rhoai/configs/run.yaml index 645e88dc1..9e83a9fb4 100644 --- a/tests/e2e-prow/rhoai/configs/run.yaml +++ b/tests/e2e-prow/rhoai/configs/run.yaml @@ -36,10 +36,10 @@ providers: api_token: ${env.VLLM_API_KEY} tls_verify: false max_tokens: 1024 - - provider_id: openai - provider_type: remote::openai - config: - api_key: ${env.OPENAI_API_KEY} + # - provider_id: openai + # provider_type: remote::openai + # config: + # api_key: ${env.OPENAI_API_KEY} - config: {} provider_id: sentence-transformers provider_type: inline::sentence-transformers @@ -144,7 +144,7 @@ registered_resources: shields: - shield_id: llama-guard provider_id: llama-guard - provider_shield_id: openai/gpt-4o-mini + provider_shield_id: vllm/meta-llama/Llama-3.2-1B-Instruct datasets: [] scoring_fns: [] benchmarks: [] diff --git a/tests/e2e-prow/rhoai/manifests/gpu/cluster-policy.yaml b/tests/e2e-prow/rhoai/manifests/gpu/cluster-policy.yaml new file mode 100644 index 000000000..673a62ed4 --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/gpu/cluster-policy.yaml @@ -0,0 +1,31 @@ +apiVersion: nvidia.com/v1 +kind: ClusterPolicy +metadata: + name: gpu-cluster-policy +spec: + daemonsets: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - key: gpu +
operator: Exists + effect: NoSchedule + operator: + defaultRuntime: crio + driver: + enabled: true + toolkit: + enabled: true + devicePlugin: + enabled: true + dcgm: + enabled: true + dcgmExporter: + enabled: true + gfd: + enabled: true + migManager: + enabled: false + nodeStatusExporter: + enabled: true \ No newline at end of file diff --git a/tests/e2e-prow/rhoai/manifests/gpu/create-nfd.yaml b/tests/e2e-prow/rhoai/manifests/gpu/create-nfd.yaml new file mode 100644 index 000000000..568a61e3c --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/gpu/create-nfd.yaml @@ -0,0 +1,8 @@ +# Minimal NFD instance +apiVersion: nfd.openshift.io/v1 +kind: NodeFeatureDiscovery +metadata: + name: nfd-instance + namespace: openshift-nfd +spec: + instance: "" diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml index b228ab650..005f96978 100644 --- a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml @@ -4,6 +4,8 @@ metadata: name: llama-stack-service namespace: e2e-rhoai-dsc spec: + imagePullSecrets: + - name: quay-lightspeed-pull-secret containers: - name: llama-stack-container env: @@ -17,7 +19,7 @@ spec: secretKeyRef: name: vllm-api-key-secret key: key - image: quay.io/opendatahub/llama-stack:rhoai-v2.25-latest + image: ${LLAMA_STACK_IMAGE} ports: - containerPort: 8321 volumeMounts: diff --git a/tests/e2e-prow/rhoai/manifests/namespaces/nfd.yaml b/tests/e2e-prow/rhoai/manifests/namespaces/nfd.yaml new file mode 100644 index 000000000..b15d50c6f --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/namespaces/nfd.yaml @@ -0,0 +1,5 @@ +# NFD Namespace +apiVersion: v1 +kind: Namespace +metadata: + name: openshift-nfd \ No newline at end of file diff --git a/tests/e2e-prow/rhoai/manifests/namespaces/nvidia-operator.yaml b/tests/e2e-prow/rhoai/manifests/namespaces/nvidia-operator.yaml new file mode 100644 index 000000000..81288f3a4 --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/namespaces/nvidia-operator.yaml @@ -0,0 +1,5 @@ +# NVIDIA GPU Operator Namespace +apiVersion: v1 +kind: Namespace +metadata: + name: nvidia-gpu-operator diff --git a/tests/e2e-prow/rhoai/manifests/operators/operatorgroup.yaml b/tests/e2e-prow/rhoai/manifests/operators/operatorgroup.yaml index 7ed06cac5..dd6b119c0 100644 --- a/tests/e2e-prow/rhoai/manifests/operators/operatorgroup.yaml +++ b/tests/e2e-prow/rhoai/manifests/operators/operatorgroup.yaml @@ -3,4 +3,24 @@ kind: OperatorGroup metadata: name: global-operators namespace: openshift-operators -spec: \ No newline at end of file +spec: +--- +# NVIDIA GPU Operator OperatorGroup +apiVersion: operators.coreos.com/v1 +kind: OperatorGroup +metadata: + name: nvidia-gpu-operator-group + namespace: nvidia-gpu-operator +spec: + targetNamespaces: + - nvidia-gpu-operator +--- +# Create OperatorGroup +apiVersion: operators.coreos.com/v1 +kind: OperatorGroup +metadata: + name: nfd-operator-group + namespace: openshift-nfd +spec: + targetNamespaces: + - openshift-nfd \ No newline at end of file diff --git a/tests/e2e-prow/rhoai/manifests/operators/operators.yaml b/tests/e2e-prow/rhoai/manifests/operators/operators.yaml index 2da92a08e..9eb113b82 100644 --- a/tests/e2e-prow/rhoai/manifests/operators/operators.yaml +++ b/tests/e2e-prow/rhoai/manifests/operators/operators.yaml @@ -32,4 +32,29 @@ spec: channel: stable name: rhods-operator source: redhat-operators - sourceNamespace: openshift-marketplace \ No newline at end of file + 
sourceNamespace: openshift-marketplace +--- +# NVIDIA GPU Operator Subscription +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + name: gpu-operator-certified + namespace: nvidia-gpu-operator +spec: + channel: stable + installPlanApproval: Automatic + name: gpu-operator-certified + source: certified-operators + sourceNamespace: openshift-marketplace +--- +# Node Feature Discovery (NFD) Operator Subscription +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + name: nfd + namespace: openshift-nfd +spec: + channel: stable + name: nfd + source: redhat-operators + sourceNamespace: openshift-marketplace diff --git a/tests/e2e-prow/rhoai/manifests/vllm/inference-service.yaml b/tests/e2e-prow/rhoai/manifests/vllm/vllm-inference-service-cpu.yaml similarity index 100% rename from tests/e2e-prow/rhoai/manifests/vllm/inference-service.yaml rename to tests/e2e-prow/rhoai/manifests/vllm/vllm-inference-service-cpu.yaml diff --git a/tests/e2e-prow/rhoai/manifests/vllm/vllm-inference-service-gpu.yaml b/tests/e2e-prow/rhoai/manifests/vllm/vllm-inference-service-gpu.yaml new file mode 100644 index 000000000..da2e3eb19 --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/vllm/vllm-inference-service-gpu.yaml @@ -0,0 +1,25 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: vllm-model + namespace: e2e-rhoai-dsc + annotations: + serving.kserve.io/deploymentMode: RawDeployment + sidecar.istio.io/inject: "false" +spec: + predictor: + minReplicas: 1 + maxReplicas: 1 + model: + modelFormat: + name: vLLM + runtime: vllm-gpu + resources: + limits: + nvidia.com/gpu: 1 + cpu: "4" + memory: 20Gi + requests: + nvidia.com/gpu: 1 + cpu: "2" + memory: 16Gi diff --git a/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml new file mode 100644 index 000000000..2027cfcf2 --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml @@ -0,0 +1,82 @@ +apiVersion: serving.kserve.io/v1alpha1 +kind: ServingRuntime +metadata: + annotations: + openshift.io/display-name: vLLM GPU + name: vllm-gpu + namespace: e2e-rhoai-dsc + labels: + opendatahub.io/dashboard: "true" +spec: + builtInAdapter: + modelLoadingTimeoutMillis: 90000 + containers: + - args: + - --model + - meta-llama/Llama-3.2-1B-Instruct + - --enable-auto-tool-choice + - --tool-call-parser + - llama3_json + - --chat-template + - /mnt/chat-template/tool_chat_template_llama3.2_json.jinja + - --download-dir + - /tmp/models-cache + - --port + - "8080" + - --max-model-len + - "2048" + - --gpu-memory-utilization + - "0.9" + image: ${VLLM_IMAGE} + name: kserve-container + env: + - name: HF_HUB_OFFLINE + value: "false" + - name: TRANSFORMERS_OFFLINE + value: "false" + - name: HF_DATASETS_OFFLINE + value: "false" + - name: HF_HOME + value: /mnt/models-cache/hf_home + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + - name: VLLM_API_KEY + valueFrom: + secretKeyRef: + name: vllm-api-key-secret + key: key + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + volumeMounts: + - name: chat-template + mountPath: /mnt/chat-template + - name: models-cache + mountPath: /mnt/models-cache + - name: vllm-cache + mountPath: /.cache + resources: + limits: + nvidia.com/gpu: 1 + cpu: "6" + memory: 20Gi + requests: + nvidia.com/gpu: 1 + cpu: "4" + memory: 16Gi + volumes: + - name: chat-template + configMap: + name: vllm-chat-template + - name: models-cache + emptyDir: {} + - name: 
vllm-cache + emptyDir: {} + multiModel: false + supportedModelFormats: + - autoSelect: true + name: vLLM diff --git a/tests/e2e-prow/rhoai/pipeline-services.sh b/tests/e2e-prow/rhoai/pipeline-services.sh index 832bff727..8d011bce7 100755 --- a/tests/e2e-prow/rhoai/pipeline-services.sh +++ b/tests/e2e-prow/rhoai/pipeline-services.sh @@ -2,10 +2,10 @@ BASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -oc apply -f "$BASE_DIR/manifests/lightspeed/llama-stack.yaml" +envsubst < "$BASE_DIR/manifests/lightspeed/llama-stack.yaml" | oc apply -f - oc wait pod/llama-stack-service \ --n e2e-rhoai-dsc --for=condition=Ready --timeout=300s +-n e2e-rhoai-dsc --for=condition=Ready --timeout=600s # Get url address of llama-stack pod oc label pod llama-stack-service pod=llama-stack-service -n e2e-rhoai-dsc diff --git a/tests/e2e-prow/rhoai/pipeline-vllm.sh b/tests/e2e-prow/rhoai/pipeline-vllm.sh index 20dedf752..47248442c 100755 --- a/tests/e2e-prow/rhoai/pipeline-vllm.sh +++ b/tests/e2e-prow/rhoai/pipeline-vllm.sh @@ -1,7 +1,11 @@ #!/bin/bash +set -euo pipefail + PIPELINE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" "$PIPELINE_DIR/scripts/bootstrap.sh" "$PIPELINE_DIR" +"$PIPELINE_DIR/scripts/gpu-setup.sh" "$PIPELINE_DIR" +source "$PIPELINE_DIR/scripts/fetch-vllm-image.sh" "$PIPELINE_DIR/scripts/deploy-vllm.sh" "$PIPELINE_DIR" "$PIPELINE_DIR/scripts/get-vllm-pod-info.sh" \ No newline at end of file diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index 71f5a4910..718dc36ae 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -8,7 +8,17 @@ trap 'echo "❌ Pipeline failed at line $LINENO"; exit 1' ERR #======================================== NAMESPACE="e2e-rhoai-dsc" MODEL_NAME="meta-llama/Llama-3.2-1B-Instruct" - +PIPELINE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Get llama-stack image from GitHub Containerfile +echo "Fetching llama-stack image from GitHub..." +LLAMA_STACK_IMAGE=$(curl -sL https://raw.githubusercontent.com/lightspeed-core/lightspeed-stack/main/test.containerfile | grep -m1 '^FROM' | awk '{print $2}') +if [ -z "$LLAMA_STACK_IMAGE" ]; then + echo "❌ Failed to fetch llama-stack image from GitHub" + exit 1 +fi +echo " -> Found llama-stack image: $LLAMA_STACK_IMAGE" +export LLAMA_STACK_IMAGE #======================================== # 2. 
ENVIRONMENT SETUP @@ -16,9 +26,14 @@ MODEL_NAME="meta-llama/Llama-3.2-1B-Instruct" echo "===== Setting up environment variables =====" export HUGGING_FACE_HUB_TOKEN=$(cat /var/run/huggingface/hf-token-ces-lcore-test || true) export VLLM_API_KEY=$(cat /var/run/vllm/vllm-api-key-lcore-test || true) +export QUAY_ROBOT_NAME=$(cat /var/run/quay-aipcc-name/lcore-quay-name-lcore-test || true) +export QUAY_ROBOT_PASSWORD=$(cat /var/run/quay-aipcc-password/lcore-quay-password-lcore-test || true) + [[ -n "$HUGGING_FACE_HUB_TOKEN" ]] && echo "✅ HUGGING_FACE_HUB_TOKEN is set" || { echo "❌ Missing HUGGING_FACE_HUB_TOKEN"; exit 1; } [[ -n "$VLLM_API_KEY" ]] && echo "✅ VLLM_API_KEY is set" || { echo "❌ Missing VLLM_API_KEY"; exit 1; } +[[ -n "$QUAY_ROBOT_NAME" ]] && echo "✅ QUAY_ROBOT_NAME is set" || { echo "❌ Missing QUAY_ROBOT_NAME"; exit 1; } +[[ -n "$QUAY_ROBOT_PASSWORD" ]] && echo "✅ QUAY_ROBOT_PASSWORD is set" || { echo "❌ Missing QUAY_ROBOT_PASSWORD"; exit 1; } # Basic info ls -A || true @@ -31,6 +46,11 @@ oc whoami echo "===== Creating namespace & secrets =====" oc get ns "$NAMESPACE" >/dev/null 2>&1 || oc create namespace "$NAMESPACE" +# Create NFD and NVIDIA namespaces +oc apply -f "$PIPELINE_DIR/manifests/namespaces/nfd.yaml" +oc apply -f "$PIPELINE_DIR/manifests/namespaces/nvidia-operator.yaml" + + create_secret() { local name=$1; shift echo "Creating secret $name..." @@ -40,6 +60,17 @@ create_secret() { create_secret hf-token-secret --from-literal=token="$HUGGING_FACE_HUB_TOKEN" create_secret vllm-api-key-secret --from-literal=key="$VLLM_API_KEY" +# Create Quay pull secret for llama-stack images +echo "Creating Quay pull secret..." +oc create secret docker-registry quay-lightspeed-pull-secret \ + --docker-server=quay.io \ + --docker-username="$QUAY_ROBOT_NAME" \ + --docker-password="$QUAY_ROBOT_PASSWORD" \ + -n "$NAMESPACE" 2>/dev/null && echo "✅ Quay pull secret created" || echo "⚠️ Secret exists or creation failed" + +# Link the secret to default service account for image pulls +oc secrets link default quay-lightspeed-pull-secret --for=pull -n "$NAMESPACE" 2>/dev/null || echo "⚠️ Secret already linked to default SA" + #======================================== # 4. CONFIGMAPS @@ -73,26 +104,33 @@ start_time=$(date +%s) timeout=200 while true; do - response=$(curl -sk -w "%{http_code}" \ - -H "Content-Type: application/json" \ + # Create a temporary pod for testing (if it doesn't exist) + if ! oc get pod vllm-test-curl -n "$NAMESPACE" &>/dev/null; then + oc run vllm-test-curl --image=curlimages/curl:latest \ + --restart=Never -n "$NAMESPACE" -- sleep 3600 + oc wait --for=condition=Ready pod/vllm-test-curl -n "$NAMESPACE" --timeout=60s + fi + + # Execute curl inside the pod and capture response + response=$(oc exec vllm-test-curl -n "$NAMESPACE" -- \ + curl -sk -w '\n%{http_code}' \ + -H 'Content-Type: application/json' \ -H "Authorization: Bearer $VLLM_API_KEY" \ -d "{ \"model\": \"$MODEL_NAME\", \"prompt\": \"Who won the world series in 2020?\", \"max_new_tokens\": 100 }" \ - "$KSVC_URL/v1/completions") + "$KSVC_URL/v1/completions" 2>&1 || echo -e "\n000") - if [[ ${#response} -ge 3 ]]; then - http_code="${response: -3}" - body="${response:0:${#response}-3}" - else - http_code="000" - body="$response" - fi + # Extract HTTP code from last line + http_code=$(echo "$response" | tail -1 | tr -d '[:space:]') + # Extract body from all lines except last + body=$(echo "$response" | sed '$d') if [[ "$http_code" == "200" && "$body" == *'"object":"text_completion"'* ]]; then echo "✅ API test passed." 
+ echo "$body" | jq . 2>/dev/null || echo "$body" break else echo "❌ API test failed (HTTP $http_code)" @@ -104,12 +142,16 @@ while true; do if (( elapsed >= timeout )); then echo "⏰ Timeout reached ($timeout seconds). Stopping test." + oc delete pod vllm-test-curl -n "$NAMESPACE" --ignore-not-found=true exit 1 fi sleep 20 done +# Cleanup test pod +oc delete pod vllm-test-curl -n "$NAMESPACE" --ignore-not-found=true + #======================================== # 7. DEPLOY LIGHTSPEED STACK AND LLAMA STACK @@ -123,8 +165,32 @@ oc create configmap test-script-cm -n "$NAMESPACE" --from-file=run-tests.sh ./pipeline-services.sh -oc wait pod/lightspeed-stack-service pod/llama-stack-service \ - -n "$NAMESPACE" --for=condition=Ready --timeout=300s +echo "--> Final wait for both lightspeed-stack-service and llama-stack-service pods..." +if ! oc wait pod/lightspeed-stack-service pod/llama-stack-service \ + -n "$NAMESPACE" --for=condition=Ready --timeout=600s; then + echo "" + echo "❌ One or both service pods failed to become ready within timeout" + echo "" + echo "DEBUG: Pod status:" + oc get pods -n "$NAMESPACE" -o wide || true + echo "" + echo "DEBUG: lightspeed-stack-service description:" + oc describe pod lightspeed-stack-service -n "$NAMESPACE" || true + echo "" + echo "DEBUG: llama-stack-service description:" + oc describe pod llama-stack-service -n "$NAMESPACE" || true + echo "" + echo "DEBUG: lightspeed-stack-service logs:" + oc logs lightspeed-stack-service -n "$NAMESPACE" --tail=100 || true + echo "" + echo "DEBUG: llama-stack-service logs:" + oc logs llama-stack-service -n "$NAMESPACE" --tail=100 || true + echo "" + echo "DEBUG: Recent events in namespace:" + oc get events -n "$NAMESPACE" --sort-by='.lastTimestamp' | tail -20 || true + exit 1 +fi +echo "✅ Both service pods are ready" sleep 30 oc get pods -n "$NAMESPACE" diff --git a/tests/e2e-prow/rhoai/scripts/bootstrap.sh b/tests/e2e-prow/rhoai/scripts/bootstrap.sh index 7a40ca56e..1718b70e5 100755 --- a/tests/e2e-prow/rhoai/scripts/bootstrap.sh +++ b/tests/e2e-prow/rhoai/scripts/bootstrap.sh @@ -20,9 +20,29 @@ wait_for_operator() { } # APPLY OPERATOR SUBSCRIPTIONS +echo "--> Applying OperatorGroups from operatorgroup.yaml..." +oc apply -f "$BASE_DIR/manifests/operators/operatorgroup.yaml" + +sleep 10 + echo "--> Applying Operator Subscriptions from operators.yaml..." oc apply -f "$BASE_DIR/manifests/operators/operators.yaml" +sleep 10 + +# WAIT FOR GPU OPERATOR NAMESPACE AND OPERATORGROUP +echo "--> Ensuring GPU Operator namespace and OperatorGroup are ready..." +oc wait --for=jsonpath='{.status.phase}'=Active namespace/nvidia-gpu-operator --timeout=60s +echo " -> Waiting for GPU OperatorGroup to be created..." +until oc get operatorgroup nvidia-gpu-operator-group -n nvidia-gpu-operator &>/dev/null; do + echo " ...still waiting for OperatorGroup" + sleep 2 +done +echo " -> GPU OperatorGroup ready" + +# Give OLM a moment to process the OperatorGroup before checking subscriptions +sleep 5 + # WAIT FOR OPERATORS TO BECOME READY echo "--> Waiting for Operators to be installed. This can take several minutes..." 
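Note on the wait_for_operator helper invoked in the next hunk: its body is defined near the top of bootstrap.sh and is not shown by this diff. A minimal sketch of an equivalent OLM readiness check, assuming the helper ultimately watches the Subscription's installed CSV until it reports Succeeded (the function name, arguments, and timeout below are illustrative, not the project's actual implementation):

# Illustrative only: poll OLM until the CSV installed by a Subscription reaches phase Succeeded.
wait_for_csv() {
  local subscription="$1" namespace="$2" display_name="$3"
  echo " -> Waiting for $display_name..."
  local csv=""
  until [ -n "$csv" ]; do
    csv=$(oc get subscription "$subscription" -n "$namespace" -o jsonpath='{.status.installedCSV}' 2>/dev/null)
    [ -z "$csv" ] && sleep 5
  done
  oc wait --for=jsonpath='{.status.phase}'=Succeeded "csv/$csv" -n "$namespace" --timeout=600s
}
# Hypothetical usage: wait_for_csv gpu-operator-certified nvidia-gpu-operator "GPU Operator"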
@@ -33,13 +53,42 @@ wait_for_operator "operators.coreos.com/servicemeshoperator.openshift-operators" wait_for_operator "operators.coreos.com/serverless-operator.openshift-operators" "openshift-operators" "Serverless Operator" wait_for_operator "operators.coreos.com/rhods-operator.openshift-operators" "openshift-operators" "RHODS Operator" +# Verify GPU operator InstallPlan was created before waiting for CSV +echo " -> Verifying GPU Operator InstallPlan was created..." +timeout=120 +elapsed=0 +until oc get installplan -n nvidia-gpu-operator --no-headers 2>/dev/null | grep -q .; do + if [ $elapsed -ge $timeout ]; then + echo " ❌ No InstallPlan created for GPU Operator - this is an OLM issue" + echo " Attempting to fix by recreating subscription..." + oc delete subscription gpu-operator-certified -n nvidia-gpu-operator + sleep 5 + oc apply -f "$BASE_DIR/manifests/operators/operators.yaml" + sleep 10 + # Try one more time + if ! oc get installplan -n nvidia-gpu-operator --no-headers 2>/dev/null | grep -q .; then + echo " ❌ Still no InstallPlan - manual intervention required" + exit 1 + fi + break + fi + echo " ...waiting for InstallPlan ($elapsed/$timeout seconds)" + sleep 5 + elapsed=$((elapsed + 5)) +done +echo " -> InstallPlan created successfully" + +wait_for_operator "operators.coreos.com/gpu-operator-certified.nvidia-gpu-operator" "nvidia-gpu-operator" "GPU Operator" +wait_for_operator "operators.coreos.com/nfd.openshift-nfd" "openshift-nfd" "NFD Operator" + +echo " -> Waiting for NFD CRD to be established..." +oc wait --for=condition=established --timeout=300s crd/nodefeaturediscoveries.nfd.openshift.io + echo "--> All operators are ready." oc get csv -n openshift-operators - -# APPLY DEPENDENT RESOURCES -echo "--> Applying OperatorGroup from operatorgroup.yaml..." -oc apply -f "$BASE_DIR/manifests/operators/operatorgroup.yaml" +oc get csv -n nvidia-gpu-operator +oc get csv -n openshift-nfd echo "--> Applying DataScienceCluster from ds-cluster.yaml..." oc apply -f "$BASE_DIR/manifests/operators/ds-cluster.yaml" diff --git a/tests/e2e-prow/rhoai/scripts/deploy-vllm.sh b/tests/e2e-prow/rhoai/scripts/deploy-vllm.sh index 4d31e663c..5c3201fa5 100755 --- a/tests/e2e-prow/rhoai/scripts/deploy-vllm.sh +++ b/tests/e2e-prow/rhoai/scripts/deploy-vllm.sh @@ -29,6 +29,50 @@ until oc get endpoints kserve-webhook-server-service -n redhat-ods-applications done echo "✅ KServe webhook service is ready." -oc apply -f "$BASE_DIR/manifests/vllm/vllm-runtime-cpu.yaml" +# Wait for GPU nodes to be labeled by NFD +echo "Waiting for GPU nodes to be labeled by NFD..." +timeout=600 # 10 minutes +elapsed=0 +until oc get nodes -l nvidia.com/gpu.present=true --no-headers 2>/dev/null | grep -q .; do + if [ $elapsed -ge $timeout ]; then + echo "❌ Timeout waiting for GPU nodes to be labeled" + exit 1 + fi + echo "No GPU nodes found yet. Waiting... ($elapsed/$timeout seconds)" + sleep 10 + elapsed=$((elapsed + 10)) +done +echo "✅ GPU nodes detected." + +# Wait for GPU capacity to be available +echo "Waiting for GPU capacity to be available on nodes..." +timeout=600 # 10 minutes +elapsed=0 +until [ "$(oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].status.capacity.nvidia\.com/gpu}' 2>/dev/null)" != "" ] && \ + [ "$(oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].status.capacity.nvidia\.com/gpu}' 2>/dev/null)" != "0" ]; do + if [ $elapsed -ge $timeout ]; then + echo "❌ Timeout waiting for GPU capacity" + echo "DEBUG: Checking GPU status..." 
+ oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{"capacity: "}{.status.capacity.nvidia\.com/gpu}{"\t"}{"allocatable: "}{.status.allocatable.nvidia\.com/gpu}{"\n"}{end}' + exit 1 + fi + capacity=$(oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].status.capacity.nvidia\.com/gpu}' 2>/dev/null || echo "0") + echo "GPU capacity: $capacity. Waiting... ($elapsed/$timeout seconds)" + sleep 10 + elapsed=$((elapsed + 10)) +done +echo "✅ GPU capacity available." + +# Display GPU node info +echo "GPU nodes ready:" +oc get nodes -l nvidia.com/gpu.present=true -o custom-columns=NAME:.metadata.name,GPU:.status.capacity.nvidia\\.com/gpu,INSTANCE:.metadata.labels.node\\.kubernetes\\.io/instance-type + +echo "Applying vLLM manifests..." + +envsubst < "$BASE_DIR/manifests/vllm/vllm-runtime-gpu.yaml" | oc apply -f - + +# Wait a moment for the ServingRuntime to be fully persisted before creating the InferenceService +echo "Waiting for ServingRuntime to be ready..." +sleep 5 -oc apply -f "$BASE_DIR/manifests/vllm/inference-service.yaml" \ No newline at end of file +oc apply -f "$BASE_DIR/manifests/vllm/vllm-inference-service-gpu.yaml" \ No newline at end of file diff --git a/tests/e2e-prow/rhoai/scripts/fetch-vllm-image.sh b/tests/e2e-prow/rhoai/scripts/fetch-vllm-image.sh new file mode 100755 index 000000000..2d1be03a7 --- /dev/null +++ b/tests/e2e-prow/rhoai/scripts/fetch-vllm-image.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Get vLLM CUDA image from RHOAI ServingRuntime template +echo "Fetching vLLM CUDA image from RHOAI..." +# Wait for RHOAI templates to be available (up to 20 minutes for first check) +timeout=1200 +elapsed=0 +until oc get template vllm-cuda-runtime-template -n redhat-ods-applications &>/dev/null; do + if [ $elapsed -ge $timeout ]; then + echo "❌ Timeout waiting for RHOAI templates (waited $timeout seconds)" + exit 1 + fi + echo " -> Waiting for RHOAI templates... ($elapsed/$timeout seconds)" + sleep 10 + elapsed=$((elapsed + 10)) +done + +# Extract vLLM image from the template +VLLM_IMAGE=$(oc get template vllm-cuda-runtime-template -n redhat-ods-applications -o jsonpath='{.objects[0].spec.containers[0].image}' 2>/dev/null || echo "") + +# Fallback: check existing ServingRuntimes for vLLM image +if [ -z "$VLLM_IMAGE" ]; then + echo " -> Template not found, checking existing ServingRuntimes..." + # Get all serving runtimes and filter for vLLM ones + VLLM_IMAGE=$(oc get servingruntime -A -o jsonpath='{range .items[*]}{.metadata.name}{","}{.spec.containers[0].image}{"\n"}{end}' 2>/dev/null | grep -i vllm | cut -d',' -f2 | grep 'odh-vllm-cuda-rhel9' | head -1 || echo "") +fi + +# Fallback: use default if still not found +if [ -z "$VLLM_IMAGE" ]; then + echo " -> Could not find vLLM image dynamically, using fallback..." + VLLM_IMAGE="registry.redhat.io/rhoai/odh-vllm-cuda-rhel9@sha256:5b86924790aeb996a7e3b7f9f4c8a3a676a83cd1d7484ae584101722d362c69b" +fi +echo " -> Found vLLM image: $VLLM_IMAGE" + +# Export images as environment variables for manifest substitution +export VLLM_IMAGE diff --git a/tests/e2e-prow/rhoai/scripts/get-vllm-pod-info.sh b/tests/e2e-prow/rhoai/scripts/get-vllm-pod-info.sh index ac693a47f..53ab1a03f 100755 --- a/tests/e2e-prow/rhoai/scripts/get-vllm-pod-info.sh +++ b/tests/e2e-prow/rhoai/scripts/get-vllm-pod-info.sh @@ -1,5 +1,4 @@ #!/bin/bash -set -e NAMESPACE="e2e-rhoai-dsc" ISVC_NAME="${1:-vllm-model}" @@ -11,34 +10,78 @@ echo "--> Finding the pod for InferenceService '$ISVC_NAME'..." 
# Find the running pod for the InferenceService POD_NAME="" -TIMEOUT=240 # seconds -INTERVAL=5 # check interval +CURRENT_POD="" +CURRENT_STATUS="" +TIMEOUT=580 +INTERVAL=20 ELAPSED=0 until [ -n "$POD_NAME" ] || [ $ELAPSED -ge $TIMEOUT ]; do + # Get the pod name regardless of status for visibility + CURRENT_POD=$(oc get pods -n "$NAMESPACE" \ + -l "serving.kserve.io/inferenceservice=$ISVC_NAME" \ + -o jsonpath="{.items[0].metadata.name}" 2>/dev/null) + + # Get the pod status + CURRENT_STATUS=$(oc get pods -n "$NAMESPACE" \ + -l "serving.kserve.io/inferenceservice=$ISVC_NAME" \ + -o jsonpath="{.items[0].status.phase}" 2>/dev/null) + + # Check if a running pod exists POD_NAME=$(oc get pods -n "$NAMESPACE" \ -l "serving.kserve.io/inferenceservice=$ISVC_NAME" \ -o jsonpath="{.items[?(@.status.phase=='Running')].metadata.name}" 2>/dev/null) - echo "Waiting for pod $POD_NAME in namespace $NAMESPACE" + + if [ -n "$CURRENT_POD" ]; then + echo "Waiting for pod $CURRENT_POD in namespace $NAMESPACE (current status: ${CURRENT_STATUS:-Unknown})" + # Show more debug info if pod exists but isn't Running + if [ -z "$POD_NAME" ] && [ $((ELAPSED % 60)) -eq 0 ]; then + echo " DEBUG: Pod details:" + oc get pod "$CURRENT_POD" -n "$NAMESPACE" -o wide || true + echo " DEBUG: Pod events:" + oc get events -n "$NAMESPACE" --field-selector involvedObject.name="$CURRENT_POD" --sort-by='.lastTimestamp' | tail -5 || true + fi + else + echo "Waiting for pod with label serving.kserve.io/inferenceservice=$ISVC_NAME in namespace $NAMESPACE (no pod found yet)" + # Show InferenceService status if no pod found + if [ $((ELAPSED % 60)) -eq 0 ]; then + echo " DEBUG: InferenceService status:" + oc get inferenceservice "$ISVC_NAME" -n "$NAMESPACE" -o jsonpath='{.status.conditions}' || true + echo "" + echo " DEBUG: All pods in namespace:" + oc get pods -n "$NAMESPACE" || true + fi + fi if [ -z "$POD_NAME" ]; then - echo " -> Pod not running yet, waiting $INTERVAL seconds..." + echo " -> Pod not running yet, waiting $INTERVAL seconds... ($ELAPSED/$TIMEOUT)" sleep $INTERVAL ELAPSED=$((ELAPSED + INTERVAL)) fi done -oc describe pod $POD_NAME -n $NAMESPACE || true -oc logs $POD_NAME -n $NAMESPACE || true - -POD_NAME=$(oc get pods -n $NAMESPACE -o jsonpath='{.items[0].metadata.name}') - +# Exit immediately if no running pod was found if [ -z "$POD_NAME" ]; then - echo " -> Timeout reached after $TIMEOUT seconds. Pod is not running." -else - echo " -> Pod is running: $POD_NAME" + echo "" + echo "❌ Timeout reached after $TIMEOUT seconds. Pod is not running." 
+ echo "" + echo "DEBUG: InferenceService status:" + oc describe inferenceservice "$ISVC_NAME" -n "$NAMESPACE" || true + echo "" + echo "DEBUG: All pods in namespace:" + oc get pods -n "$NAMESPACE" -o wide || true + echo "" + echo "DEBUG: Recent events:" + oc get events -n "$NAMESPACE" --sort-by='.lastTimestamp' | tail -20 || true + exit 1 fi +echo " -> Pod is running: $POD_NAME" + +# Show pod details +oc describe pod "$POD_NAME" -n "$NAMESPACE" || true +oc logs "$POD_NAME" -n "$NAMESPACE" --tail=50 || true + # Get the 'app' label for Service selector APP_LABEL=$(oc get pod "$POD_NAME" -n "$NAMESPACE" -o jsonpath='{.metadata.labels.app}') if [ -z "$APP_LABEL" ]; then @@ -47,13 +90,34 @@ if [ -z "$APP_LABEL" ]; then fi echo " -> Found 'app' label: $APP_LABEL" -# Get the Knative Service URL -KSVC_URL=$(oc get ksvc "$KSVC_NAME" -n "$NAMESPACE" -o jsonpath='{.status.url}') -if [ -z "$KSVC_URL" ]; then - echo "Error: Could not retrieve Knative URL for $KSVC_NAME" - exit 1 +# Check if this is RawDeployment mode (standard K8s Service) or Serverless (Knative Service) +if oc get ksvc "$KSVC_NAME" -n "$NAMESPACE" &>/dev/null; then + # Serverless mode - get Knative Service URL + KSVC_URL=$(oc get ksvc "$KSVC_NAME" -n "$NAMESPACE" -o jsonpath='{.status.url}') + echo " -> Found Knative Service URL: $KSVC_URL" +else + # RawDeployment mode - construct URL from standard K8s Service + echo " -> RawDeployment mode detected, looking for standard Kubernetes Service..." + SERVICE_NAME="${ISVC_NAME}-predictor" + + # Check if the service exists + if ! oc get service "$SERVICE_NAME" -n "$NAMESPACE" &>/dev/null; then + echo "Error: Could not find Service $SERVICE_NAME" + exit 1 + fi + + # Get the cluster IP and targetPort (the actual container port) + CLUSTER_IP=$(oc get service "$SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.clusterIP}') + SERVICE_PORT=$(oc get service "$SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.ports[0].port}') + TARGET_PORT=$(oc get service "$SERVICE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.ports[0].targetPort}') + + # Use targetPort (container port) instead of service port for RawDeployment + PORT=${TARGET_PORT:-$SERVICE_PORT} + + # Construct internal cluster URL + KSVC_URL="http://${SERVICE_NAME}.${NAMESPACE}.svc.cluster.local:${PORT}" + echo " -> Found Service URL: $KSVC_URL (Cluster IP: $CLUSTER_IP, Service Port: $SERVICE_PORT, Target Port: $TARGET_PORT)" fi -echo " -> Found Knative URL: $KSVC_URL" # Save all info to pod.env cat <<EOF > "$ENV_FILE" diff --git a/tests/e2e-prow/rhoai/scripts/gpu-setup.sh b/tests/e2e-prow/rhoai/scripts/gpu-setup.sh new file mode 100755 index 000000000..d72d744bb --- /dev/null +++ b/tests/e2e-prow/rhoai/scripts/gpu-setup.sh @@ -0,0 +1,208 @@ +#!/bin/bash + +set -euo pipefail + +BASE_DIR="$1" + +echo "Setting up GPU support..." + +# Debug: Show all nodes and their instance types +echo "" +echo "--> DEBUG: Cluster nodes before GPU setup..." +oc get nodes -o custom-columns=NAME:.metadata.name,INSTANCE:.metadata.labels.node\\.kubernetes\\.io/instance-type,STATUS:.status.conditions[-1].type + +# Debug: Check for GPU instance types and taints +echo "" +echo "--> DEBUG: Checking for GPU nodes and taints..."
+gpu_nodes=$(oc get nodes -o jsonpath='{range .items[*]}{.metadata.name}{","}{.metadata.labels.node\.kubernetes\.io/instance-type}{"\n"}{end}' | grep -E "g4dn|p3|p4|g5" | cut -d',' -f1 || echo "") + +if [ -n "$gpu_nodes" ]; then + echo " Found GPU instance types:" + for node in $gpu_nodes; do + echo " Node: $node" + echo " Instance Type: $(oc get node $node -o jsonpath='{.metadata.labels.node\.kubernetes\.io/instance-type}')" + echo " Taints:" + oc get node $node -o jsonpath='{.spec.taints}' || echo " No taints" + echo "" + done +else + echo " No GPU instance types found (g4dn, p3, p4, g5)" + echo " All node instance types:" + oc get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.labels.node\.kubernetes\.io/instance-type}{"\n"}{end}' +fi + +# Apply NFD instance +echo "" +echo "--> Applying NFD instance..." +oc apply -f "$BASE_DIR/manifests/gpu/create-nfd.yaml" + +# Wait for NFD worker daemonset to be created +echo "--> Waiting for NFD worker daemonset to be created..." +timeout=60 +elapsed=0 +until oc get daemonset nfd-worker -n openshift-nfd &>/dev/null; do + if [ $elapsed -ge $timeout ]; then + echo "❌ Timeout waiting for NFD worker daemonset" + exit 1 + fi + echo " Waiting for nfd-worker daemonset... ($elapsed/$timeout seconds)" + sleep 5 + elapsed=$((elapsed + 5)) +done +echo "✅ NFD worker daemonset created" + +# Patch NFD worker daemonset to add GPU node tolerations +# This is needed in the prow env to be assigned a GPU +echo "--> Patching NFD worker daemonset with GPU tolerations..." +oc patch daemonset nfd-worker -n openshift-nfd --type=json -p='[ + { + "op": "add", + "path": "/spec/template/spec/tolerations", + "value": [ + { + "key": "nvidia.com/gpu", + "operator": "Exists", + "effect": "NoSchedule" + }, + { + "key": "gpu", + "operator": "Exists", + "effect": "NoSchedule" + } + ] + } +]' +echo "✅ NFD worker tolerations added" + +# Apply ClusterPolicy +echo "" +echo "--> Applying ClusterPolicy..." +oc apply -f "$BASE_DIR/manifests/gpu/cluster-policy.yaml" + +# Wait for GPU operator pods to be created and healthy +echo "" +echo "--> Waiting for GPU operator pods to be healthy..." +echo " This may take up to 10 minutes while images are pulled and pods start..." +timeout=1200 +elapsed=0 +until oc get pods -n nvidia-gpu-operator --no-headers 2>/dev/null | awk '{if ($3 != "Running" && $3 != "Completed") exit 1}' && [ $(oc get pods -n nvidia-gpu-operator --no-headers 2>/dev/null | wc -l) -gt 5 ]; do + if [ $elapsed -ge $timeout ]; then + echo "❌ Timeout waiting for GPU operator pods to be healthy" + echo "Current pod status:" + oc get pods -n nvidia-gpu-operator + echo "" + echo "DEBUG: Checking for scheduling issues..." + oc get pods -n nvidia-gpu-operator -o wide + echo "" + echo "DEBUG: Checking pod events for failures..." + oc get events -n nvidia-gpu-operator --sort-by='.lastTimestamp' | tail -20 + exit 1 + fi + pod_count=$(oc get pods -n nvidia-gpu-operator --no-headers 2>/dev/null | wc -l || echo 0) + failed_pods=$(oc get pods -n nvidia-gpu-operator --no-headers 2>/dev/null | awk '{if ($3 != "Running" && $3 != "Completed") print $1}' | wc -l || echo 0) + echo " Pods: $pod_count total, $failed_pods not ready. Waiting... 
($elapsed/$timeout seconds)" + + # Show additional debug info every 60 seconds + if [ $((elapsed % 60)) -eq 0 ] && [ $elapsed -gt 0 ]; then + echo " DEBUG: Current pod statuses:" + oc get pods -n nvidia-gpu-operator -o wide + fi + + sleep 15 + elapsed=$((elapsed + 15)) +done +echo "✅ All GPU operator pods are healthy" + +# Debug: Show what pods are running +echo "" +echo "--> DEBUG: GPU operator pods deployed:" +oc get pods -n nvidia-gpu-operator -o wide + +# Wait for GPU nodes to be labeled by NFD +echo "" +echo "--> Waiting for GPU nodes to be labeled by NFD..." +timeout=120 +elapsed=0 +until oc get nodes -l nvidia.com/gpu.present=true --no-headers 2>/dev/null | grep -q .; do + if [ $elapsed -ge $timeout ]; then + echo "❌ Timeout waiting for GPU nodes to be labeled" + echo "" + echo "DEBUG: Checking why nodes aren't labeled..." + echo "All node labels related to features:" + oc get nodes --show-labels | grep -E "feature|gpu|nvidia" || echo "No GPU/feature labels found on any nodes" + echo "" + echo "DEBUG: NFD worker pods status:" + oc get pods -n openshift-nfd -o wide + echo "" + echo "DEBUG: Recent NFD events:" + oc get events -n openshift-nfd --sort-by='.lastTimestamp' | tail -10 + exit 1 + fi + echo " No GPU nodes found yet. Waiting... ($elapsed/$timeout seconds)" + + # Show debug info every 30 seconds + if [ $((elapsed % 30)) -eq 0 ] && [ $elapsed -gt 0 ]; then + echo " DEBUG: Checking NFD worker pods..." + oc get pods -n openshift-nfd --no-headers + fi + + sleep 10 + elapsed=$((elapsed + 10)) +done +echo "✅ GPU nodes detected" + +# Debug: Show labeled nodes +echo "" +echo "--> DEBUG: GPU nodes labeled:" +oc get nodes -l nvidia.com/gpu.present=true -o custom-columns=NAME:.metadata.name,INSTANCE:.metadata.labels.node\\.kubernetes\\.io/instance-type,TAINTS:.spec.taints + +# Wait for GPU capacity AND allocatable to become available +echo "--> Waiting for GPU capacity and allocatable to be available on nodes..." +timeout=120 +elapsed=0 +until [ "$(oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].status.capacity.nvidia\.com/gpu}' 2>/dev/null)" != "" ] && \ + [ "$(oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].status.capacity.nvidia\.com/gpu}' 2>/dev/null)" != "0" ]; do + if [ $elapsed -ge $timeout ]; then + echo "❌ Timeout waiting for GPU capacity/allocatable" + echo "" + echo "DEBUG: Investigating why GPU capacity is not appearing..." + echo "Device plugin pods:" + oc get pods -n nvidia-gpu-operator -l app=nvidia-device-plugin-daemonset -o wide + echo "" + echo "Device plugin pod logs (last 20 lines):" + device_plugin_pod=$(oc get pods -n nvidia-gpu-operator -l app=nvidia-device-plugin-daemonset -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + if [ -n "$device_plugin_pod" ]; then + oc logs -n nvidia-gpu-operator "$device_plugin_pod" --tail=20 || echo "Could not fetch logs" + else + echo "No device plugin pod found - checking for scheduling issues..." 
+ oc get events -n nvidia-gpu-operator --field-selector involvedObject.name=nvidia-device-plugin-daemonset --sort-by='.lastTimestamp' | tail -10 + fi + echo "" + echo "Node GPU capacity details:" + oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{"capacity: "}{.status.capacity.nvidia\.com/gpu}{"\t"}{"allocatable: "}{.status.allocatable.nvidia\.com/gpu}{"\n"}{end}' + exit 1 + fi + capacity=$(oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].status.capacity.nvidia\.com/gpu}' 2>/dev/null || echo "0") + allocatable=$(oc get nodes -l nvidia.com/gpu.present=true -o jsonpath='{.items[0].status.allocatable.nvidia\.com/gpu}' 2>/dev/null || echo "0") + echo " GPU capacity: $capacity, allocatable: $allocatable. Waiting for both > 0... ($elapsed/$timeout seconds)" + + # Show debug info every 30 seconds + if [ $((elapsed % 30)) -eq 0 ] && [ $elapsed -gt 0 ]; then + echo " DEBUG: Checking device plugin daemonset pods..." + oc get pods -n nvidia-gpu-operator -l app=nvidia-device-plugin-daemonset --no-headers + fi + + sleep 15 + elapsed=$((elapsed + 15)) +done + +echo "" +echo "✅ GPU setup complete!" +echo "" +echo "GPU Node Status:" +oc get nodes -l nvidia.com/gpu.present=true -o custom-columns=NAME:.metadata.name,GPU:.status.capacity.nvidia\\.com/gpu,ALLOCATABLE:.status.allocatable.nvidia\\.com/gpu,INSTANCE:.metadata.labels.node\\.kubernetes\\.io/instance-type + +echo "" +echo "ClusterPolicy Status:" +oc get clusterpolicy gpu-cluster-policy -o jsonpath='{.status.state}' +echo ""
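A possible follow-up check, not part of this patch: once the ClusterPolicy reports its state, a throwaway pod that requests one GPU and runs nvidia-smi verifies end-to-end scheduling with the same tolerations configured above. The CUDA image tag is only an example of a public base image and may need to be adjusted:

# Optional GPU smoke test (sketch): request one GPU, run nvidia-smi once, then clean up.
cat <<'EOF' | oc apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test
  namespace: nvidia-gpu-operator
spec:
  restartPolicy: Never
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
    - key: gpu
      operator: Exists
      effect: NoSchedule
  containers:
    - name: cuda
      image: nvcr.io/nvidia/cuda:12.4.1-base-ubi9
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1
EOF
oc wait --for=jsonpath='{.status.phase}'=Succeeded pod/gpu-smoke-test -n nvidia-gpu-operator --timeout=300s
oc logs gpu-smoke-test -n nvidia-gpu-operator
oc delete pod gpu-smoke-test -n nvidia-gpu-operator --ignore-not-found=true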