
Commit 58027f9

Add Scenario for AI conformance

Related to: #17830

Signed-off-by: Arnaud Meukam <ameukam@gmail.com>

1 parent 3fae22e commit 58027f9

1 file changed

Lines changed: 158 additions & 0 deletions

#!/usr/bin/env bash

# Copyright 2026 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -o errexit
set -o nounset
set -o pipefail

REPO_ROOT=$(git rev-parse --show-toplevel)
source "${REPO_ROOT}"/tests/e2e/scenarios/lib/common.sh

# AI Conformance requirements:
# - Kubernetes 1.35
# - NVIDIA L4 instances (g6.xlarge on AWS)
# - Gateway API
# - Gang scheduling (Kueue)
# - Robust controller (KubeRay)

# Track the latest stable Kubernetes release (1.35+, the conformance target)
K8S_VERSION=$(curl -L -s https://dl.k8s.io/release/stable.txt)
export K8S_VERSION
export CLOUD_PROVIDER=aws
# Ensure a region with L4 (g6) availability
export AWS_REGION="${AWS_REGION:-us-east-2}"

# Check for g6.xlarge availability in the region
echo "Checking availability of g6.xlarge in ${AWS_REGION}..."
AVAILABILITY=$(aws ec2 describe-instance-type-offerings \
  --location-type availability-zone \
  --filters Name=instance-type,Values=g6.xlarge \
  --region "${AWS_REGION}" \
  --query 'InstanceTypeOfferings' \
  --output text)
if [[ -z "${AVAILABILITY}" ]]; then
  echo "Error: g6.xlarge instances are not available in ${AWS_REGION}. Please choose a region with L4 GPU support."
  exit 1
fi

kops-acquire-latest

# Cluster configuration:
# - Networking: Cilium with Gateway API enabled
# - Nodes: g6.xlarge (L4 GPU)
# - Runtime: NVIDIA enabled
OVERRIDES="${OVERRIDES-} --networking=cilium"
OVERRIDES="${OVERRIDES} --set=cluster.spec.networking.cilium.enableGatewayAPI=true"
OVERRIDES="${OVERRIDES} --node-size=g6.xlarge"
OVERRIDES="${OVERRIDES} --node-count=2"
OVERRIDES="${OVERRIDES} --set=cluster.spec.containerd.nvidia.enabled=true"

kops-up

echo "----------------------------------------------------------------"
59+
echo "Deploying AI Conformance Components"
60+
echo "----------------------------------------------------------------"
61+
62+
# 0. Gateway API CRDs (required for Cilium's Gateway API support)
echo "Installing Gateway API CRDs v1.2.0..."
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.2.0/standard-install.yaml

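# Cilium publishes a GatewayClass named "cilium" once the CRDs exist. If the
# CRDs land after the operator has started, a restart lets it pick them up
# immediately (assumption: kOps runs the operator as the "cilium-operator"
# deployment in kube-system; the restart is harmless if unnecessary).
kubectl rollout restart deployment -n kube-system cilium-operator || true
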
# 1. NVIDIA Device Plugin
echo "Installing NVIDIA Device Plugin..."
kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.0/nvidia-device-plugin.yml

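# With the device plugin running, each GPU surfaces as the extended resource
# "nvidia.com/gpu", which pods can request the classic (non-DRA) way, e.g.:
#
#   resources:
#     limits:
#       nvidia.com/gpu: 1
#
# The DRA request path is exercised separately at the end of this script.
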
# 1.5 NVIDIA DRA Driver
echo "Installing Helm..."
curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
chmod 700 get_helm.sh
USE_SUDO=false HELM_INSTALL_DIR=. ./get_helm.sh
PATH="$(pwd):${PATH}"
export PATH

echo "Installing NVIDIA DRA Driver..."
78+
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
79+
helm repo update
80+
helm install nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \
81+
--create-namespace \
82+
--namespace nvidia-dra-driver-gpu \
83+
--version 25.8.1 \
84+
--set resources.gpus.enabled=true \
85+
--wait
86+
87+
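# The chart registers a DeviceClass for GPUs (the driver publishes
# "gpu.nvidia.com"), which the sample ResourceClaim at the end of this script
# requests devices from. A quick sanity check that the class exists:
kubectl get deviceclass || echo "Warning: no DeviceClass objects found yet"
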
# 2. Gang Scheduling (Kueue)
echo "Installing Kueue..."
kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.14.8/manifests.yaml

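# Kueue admits no workloads until a queue hierarchy exists. A minimal sketch
# of the objects a gang-scheduled workload would need (names and quotas here
# are illustrative, not part of this scenario):
#
# kubectl apply -f - <<EOF
# apiVersion: kueue.x-k8s.io/v1beta1
# kind: ResourceFlavor
# metadata:
#   name: default-flavor
# ---
# apiVersion: kueue.x-k8s.io/v1beta1
# kind: ClusterQueue
# metadata:
#   name: cluster-queue
# spec:
#   namespaceSelector: {}
#   resourceGroups:
#   - coveredResources: ["cpu", "memory", "nvidia.com/gpu"]
#     flavors:
#     - name: default-flavor
#       resources:
#       - name: "cpu"
#         nominalQuota: 8
#       - name: "memory"
#         nominalQuota: 32Gi
#       - name: "nvidia.com/gpu"
#         nominalQuota: 2
# ---
# apiVersion: kueue.x-k8s.io/v1beta1
# kind: LocalQueue
# metadata:
#   name: default
#   namespace: default
# spec:
#   clusterQueue: cluster-queue
# EOF
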
# 3. Robust Controller (KubeRay)
echo "Installing KubeRay Operator..."
# KubeRay v1.5.0
kubectl apply -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=v1.5.0"

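# The operator reconciles RayCluster objects. A minimal (illustrative)
# RayCluster, in case a conformance check needs a live Ray control plane;
# the image tag and the absence of worker groups are assumptions:
#
# kubectl apply -f - <<EOF
# apiVersion: ray.io/v1
# kind: RayCluster
# metadata:
#   name: test-raycluster
# spec:
#   headGroupSpec:
#     rayStartParams: {}
#     template:
#       spec:
#         containers:
#         - name: ray-head
#           image: rayproject/ray:2.41.0
# EOF
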
echo "----------------------------------------------------------------"
97+
echo "Verifying Cluster and Components"
98+
echo "----------------------------------------------------------------"
99+
100+
# Wait for kOps validation
"${KOPS}" validate cluster --wait=15m

# Verify components
echo "Verifying NVIDIA Device Plugin..."
kubectl rollout status daemonset -n kube-system nvidia-device-plugin-daemonset --timeout=5m || echo "Warning: NVIDIA Device Plugin not ready yet"

echo "Verifying Kueue..."
kubectl rollout status deployment -n kueue-system kueue-controller-manager --timeout=5m || echo "Warning: Kueue not ready yet"

echo "Verifying KubeRay..."
kubectl rollout status deployment -n kuberay-system kuberay-operator --timeout=5m || echo "Warning: KubeRay not ready yet"

echo "Verifying Gateway API..."
kubectl get gatewayclass || echo "Warning: GatewayClass not found"

echo "Verifying Allocatable GPUs..."
117+
# Wait a bit for nodes to report resources
118+
sleep 30
119+
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}: {.status.allocatable.nvidia\.com/gpu} GPUs{"\n"}{end}'
120+
121+
echo "Running Sample DRA Workload..."
122+
# Create a ResourceClaim and Pod to test DRA
123+
kubectl apply -f - <<EOF
124+
apiVersion: resource.k8s.io/v1
125+
kind: ResourceClaim
126+
metadata:
127+
name: test-gpu-claim
128+
spec:
129+
resourceClassName: nvidia-gpu
130+
---
131+
apiVersion: v1
132+
kind: Pod
133+
metadata:
134+
name: test-gpu-pod
135+
spec:
136+
restartPolicy: Never
137+
containers:
138+
- name: test
139+
image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0-ubuntu22.04
140+
command: ["/bin/sh", "-c"]
141+
args: ["/cuda-samples/vectorAdd"]
142+
resources:
143+
claims:
144+
- name: gpu
145+
resourceClaims:
146+
- name: gpu
147+
resourceClaimName: test-gpu-claim
148+
EOF
149+
150+
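# Optional: confirm the DRA driver actually allocated the claim
kubectl get resourceclaim test-gpu-claim || echo "Warning: ResourceClaim not found"
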
echo "Waiting for Sample Workload to Complete..."
151+
# Wait for the pod to succeed
152+
kubectl wait --for=condition=Ready pod/test-gpu-pod --timeout=5m || true
153+
kubectl logs test-gpu-pod || echo "Failed to get logs"
154+
155+
# Note: the actual AI conformance test suite (e.g., a k8s-ai-conformance
# binary) would be executed here. This scenario only establishes the
# compliant environment.
echo "AI Conformance Environment Setup Complete."
