Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ RUN wget -O awscli.zip https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip
unzip awscli.zip && \
./aws/install
# we need gsutil from the gcloud CLI for kubetest-tester-ginkgo
RUN amazon-linux-extras install python3.8
RUN amazon-linux-extras install python3.11
ARG GCLOUD_SDK_URL=https://dl.google.com/dl/cloudsdk/channels/rapid/google-cloud-sdk.tar.gz
RUN wget -O google-cloud-sdk.tar.gz -q $GCLOUD_SDK_URL && \
tar xzf google-cloud-sdk.tar.gz -C / && \
Expand Down
4 changes: 3 additions & 1 deletion test/cases/nvidia/unit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
_ "embed"
"fmt"
"testing"
"time"

fwext "github.com/aws/aws-k8s-tester/internal/e2e"
"sigs.k8s.io/e2e-framework/klient/wait"
Expand Down Expand Up @@ -66,7 +67,8 @@ func TestSingleNodeUnitTest(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{Name: "unit-test-job", Namespace: "default"},
}
err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job),
wait.WithContext(ctx))
wait.WithContext(ctx),
wait.WithTimeout(10*time.Minute))
if err != nil {
t.Fatal(err)
}
Expand Down
9 changes: 9 additions & 0 deletions test/images/nvidia/gpu_unit_tests/tests/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,12 @@ generate_data()
eval "$cmd" > $expected
_assert_data "$expected" "$cmd" "$msg"
}

# is_vgpu: succeed (status 0) when the instance type is one of the vGPU
# families (g6f.*, gr6f.*), fail (status 1) for anything else.
# The type is read from EC2_INSTANCE_TYPE; get_instance_type is only
# consulted when that variable is unset or empty.
function is_vgpu()
{
    local ec2_type=${EC2_INSTANCE_TYPE:-$(get_instance_type)}

    if [[ "${ec2_type}" == g6f.* || "${ec2_type}" == gr6f.* ]]; then
        return 0
    fi

    # Not supported
    return 1
}
5 changes: 5 additions & 0 deletions test/images/nvidia/gpu_unit_tests/tests/test_basic.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ test_03_nvbandwidth()

test_04_dcgm_diagnostics()
{
# This test is not applicable for vGPU instance types.
if is_vgpu; then
skip "This test does not apply to vGPU instances (g6f.*, gr6f.*)"
fi

# https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#run-levels-and-tests
if [[ $EC2_INSTANCE_TYPE == g* ]]; then
# G-series instances don't have NVLink or GPU peer-to-peer (P2P) communication
Expand Down
17 changes: 17 additions & 0 deletions test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,26 @@ test_nvidia_gpu_unused()
# Fails when any GPU reports an active clock throttle/event reason other than
# the ones explicitly tolerated by the filter below.
test_nvidia_gpu_throttled()
{
    # Clock throttle reasons are not checked on vGPU instance types, so the
    # test is skipped there.
    if is_vgpu; then
        skip "This test does not apply to vGPU instances (g6f.*, gr6f.*)"
    fi

    # https://docs.nvidia.com/deploy/nvml-api/group__nvmlClocksEventReasons.html#group__nvmlClocksEventReasons
    # Tolerated bitmask values:
    #   0x0000000000000000  no reason active
    #   0x0000000000000001  nvmlClocksEventReasonGpuIdle
    #   0x0000000000000004  presumably nvmlClocksEventReasonSwPowerCap -- confirm against the NVML reference above
    # `grep -E` replaces the obsolescent `egrep` wrapper; the pattern is unchanged.
    filter="grep -E -v -e '(0x0000000000000000|0x0000000000000001|0x0000000000000004)'"
    cmd="nvidia-smi --query-gpu index,gpu_bus_id,gpu_uuid,clocks_throttle_reasons.active --format=csv,noheader"

    # With -v, grep exits 1 when every line matched a tolerated value, i.e. no
    # GPU reported an unexpected reason. assert_status_code presumably checks
    # that the pipeline exits with that status.
    assert_status_code 1 "$cmd | $filter" "Throttled gpu detected"
}


# Checks the vGPU software license section reported by nvidia-smi against the
# recorded expectation for this instance type.
test_nvidia_vgpu_license_status()
{
    # Only meaningful on vGPU instance types; skip everywhere else.
    is_vgpu || skip "This test only applies to vGPU instances (g6f.*, gr6f.*)"

    # Compare the 'vGPU Software' line and the two lines after it from
    # `nvidia-smi -q` with the expected-data file.
    assert_data $data/nvidia_vgpu_license_status.txt "nvidia-smi -q | grep 'vGPU Software' -A 2" "vGPU license status validation failed"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name, index, pci.bus_id
NVIDIA L4-12Q, 0, 00000000:35:00.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/sys/devices/system/node/node0/cpulist:0-15
/sys/devices/system/node/node0/distance:10
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name, pci.bus_id, persistence_mode
NVIDIA L4-12Q, 00000000:35:00.0, Enabled
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
GPU0 CPU Affinity NUMA Affinity GPU NUMA ID
GPU0 X 0-15 0 N/A
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
vGPU Software Licensed Product
Product Name : NVIDIA RTX Virtual Workstation
License Status : Licensed (Expiry: N/A)
Loading