From a3db8f216b925a8b12941481e1c67abb8616868e Mon Sep 17 00:00:00 2001 From: Shiv Bhosale Date: Fri, 3 Oct 2025 22:13:33 +0000 Subject: [PATCH 1/3] feat: add test cases to check vGPU license status --- .../nvidia/gpu_unit_tests/tests/common.sh | 20 +++++++++++++++++++ .../gpu_unit_tests/tests/test_sysinfo.sh | 15 ++++++++++++++ .../g6f.4xlarge/efa_count.txt | 1 + .../g6f.4xlarge/gpu_count.txt | 2 ++ .../g6f.4xlarge/numa_topo.txt | 2 ++ .../g6f.4xlarge/nvidia_persistence_status.txt | 2 ++ .../g6f.4xlarge/nvidia_smi_topo.txt | 2 ++ .../nvidia_vgpu_license_status.txt | 3 +++ 8 files changed, 47 insertions(+) create mode 100644 test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/efa_count.txt create mode 100644 test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/gpu_count.txt create mode 100644 test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/numa_topo.txt create mode 100644 test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_persistence_status.txt create mode 100644 test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_smi_topo.txt create mode 100644 test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_vgpu_license_status.txt diff --git a/test/images/nvidia/gpu_unit_tests/tests/common.sh b/test/images/nvidia/gpu_unit_tests/tests/common.sh index a6f5222d7..655f43e6b 100644 --- a/test/images/nvidia/gpu_unit_tests/tests/common.sh +++ b/test/images/nvidia/gpu_unit_tests/tests/common.sh @@ -53,3 +53,23 @@ generate_data() eval "$cmd" > $expected _assert_data "$expected" "$cmd" "$msg" } + +skip_for_vgpu_instances() { + local test_name="$1" + local instance_type="$(get_instance_type)" + case "$instance_type" in + g6f*) echo "skipping $test_name: no current support for instance_type: $instance_type" + exit 0;; + gr6f*) echo "skipping $test_name: no current support for instance_type: $instance_type" + exit 0;; + esac +} + +function is_vgpu() +{ + local instance_type="$(get_instance_type)" + case "${instance_type}" in + g6f.*|gr6f.*) return ;; + *) return 1 ;; # Not supported + esac +} diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh index 5dc282b3b..b0880db78 100644 --- a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh @@ -61,9 +61,24 @@ test_nvidia_gpu_unused() test_nvidia_gpu_throttled() { + # vGPU instances don't support GPU clock throttling detection. + # This test is not applicable for vGPU instance types. + skip_for_vgpu_instances # https://docs.nvidia.com/deploy/nvml-api/group__nvmlClocksEventReasons.html#group__nvmlClocksEventReasons # The only bit allowed is nvmlClocksEventReasonGpuIdle 0x0000000000000001LL filter="egrep -v -e '(0x0000000000000000|0x0000000000000001|0x0000000000000004)'" cmd="nvidia-smi --query-gpu index,gpu_bus_id,gpu_uuid,clocks_throttle_reasons.active --format=csv,noheader" assert_status_code 1 "$cmd | $filter" "Throttled gpu detected" } + + +test_nvidia_vgpu_license_status() +{ + if ! is_vgpu; then + skip "This test only applies to vGPU instances (g6f.*, gr6f.*)" + fi + + assert_golden $data/nvidia_vgpu_license_status.txt \ + "nvidia-smi -q | grep 'vGPU Software' -A 2" \ + "vGPU license status validation failed" +} \ No newline at end of file diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/efa_count.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/efa_count.txt new file mode 100644 index 000000000..573541ac9 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/efa_count.txt @@ -0,0 +1 @@ +0 diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/gpu_count.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/gpu_count.txt new file mode 100644 index 000000000..a6e53172e --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/gpu_count.txt @@ -0,0 +1,2 @@ +name, index, pci.bus_id +NVIDIA L4-12Q, 0, 00000000:35:00.0 diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/numa_topo.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/numa_topo.txt new file mode 100644 index 000000000..ed6c897aa --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/numa_topo.txt @@ -0,0 +1,2 @@ +/sys/devices/system/node/node0/cpulist:0-15 +/sys/devices/system/node/node0/distance:10 diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_persistence_status.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_persistence_status.txt new file mode 100644 index 000000000..b6a99abc3 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_persistence_status.txt @@ -0,0 +1,2 @@ +name, pci.bus_id, persistence_mode +NVIDIA L4-12Q, 00000000:35:00.0, Enabled diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_smi_topo.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_smi_topo.txt new file mode 100644 index 000000000..39eb6d6f2 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_smi_topo.txt @@ -0,0 +1,2 @@ + GPU0 CPU Affinity NUMA Affinity GPU NUMA ID +GPU0 X 0-15 0 N/A diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_vgpu_license_status.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_vgpu_license_status.txt new file mode 100644 index 000000000..0fca4dfc4 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_vgpu_license_status.txt @@ -0,0 +1,3 @@ + vGPU Software Licensed Product + Product Name : NVIDIA RTX Virtual Workstation + License Status : Licensed (Expiry: N/A) From 4506d0ca3501876316ca0cb8c8058cccc4e160df Mon Sep 17 00:00:00 2001 From: Shiv Bhosale Date: Wed, 22 Oct 2025 06:12:06 +0000 Subject: [PATCH 2/3] update is_vgpu function, skip test_04_dcgm_diagnostics for vgpu instances and increase unit test timeout to 10 mins --- test/cases/nvidia/unit_test.go | 4 +++- test/images/nvidia/gpu_unit_tests/tests/common.sh | 13 +------------ .../nvidia/gpu_unit_tests/tests/test_basic.sh | 5 +++++ .../nvidia/gpu_unit_tests/tests/test_sysinfo.sh | 6 ++++-- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/test/cases/nvidia/unit_test.go b/test/cases/nvidia/unit_test.go index 27918e878..d703620b1 100644 --- a/test/cases/nvidia/unit_test.go +++ b/test/cases/nvidia/unit_test.go @@ -7,6 +7,7 @@ import ( _ "embed" "fmt" "testing" + "time" fwext "github.com/aws/aws-k8s-tester/internal/e2e" "sigs.k8s.io/e2e-framework/klient/wait" @@ -66,7 +67,8 @@ func TestSingleNodeUnitTest(t *testing.T) { ObjectMeta: metav1.ObjectMeta{Name: "unit-test-job", Namespace: "default"}, } err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job), - wait.WithContext(ctx)) + wait.WithContext(ctx), + wait.WithTimeout(10*time.Minute)) if err != nil { t.Fatal(err) } diff --git a/test/images/nvidia/gpu_unit_tests/tests/common.sh b/test/images/nvidia/gpu_unit_tests/tests/common.sh index 655f43e6b..140e05bda 100644 --- a/test/images/nvidia/gpu_unit_tests/tests/common.sh +++ b/test/images/nvidia/gpu_unit_tests/tests/common.sh @@ -54,20 +54,9 @@ generate_data() _assert_data "$expected" "$cmd" "$msg" } -skip_for_vgpu_instances() { - local test_name="$1" - local instance_type="$(get_instance_type)" - case "$instance_type" in - g6f*) echo "skipping $test_name: no current support for instance_type: $instance_type" - exit 0;; - gr6f*) echo "skipping $test_name: no current support for instance_type: $instance_type" - exit 0;; - esac -} - function is_vgpu() { - local instance_type="$(get_instance_type)" + local instance_type=${EC2_INSTANCE_TYPE:-$(get_instance_type)} case "${instance_type}" in g6f.*|gr6f.*) return ;; *) return 1 ;; # Not supported diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_basic.sh b/test/images/nvidia/gpu_unit_tests/tests/test_basic.sh index 518e1fc16..9ffc0c0c3 100644 --- a/test/images/nvidia/gpu_unit_tests/tests/test_basic.sh +++ b/test/images/nvidia/gpu_unit_tests/tests/test_basic.sh @@ -31,6 +31,11 @@ test_03_nvbandwidth() test_04_dcgm_diagnostics() { + # This test is not applicable for vGPU instance types. + if is_vgpu; then + skip "This test does not apply to vGPU instances (g6f.*, gr6f.*)" + fi + # https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#run-levels-and-tests if [[ $EC2_INSTANCE_TYPE == g* ]]; then # The G series instance don't have nvlink and GPU p2p communication diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh index b0880db78..909fe9b90 100644 --- a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh @@ -63,7 +63,9 @@ test_nvidia_gpu_throttled() # vGPU instances don't support GPU clock throttling detection. # This test is not applicable for vGPU instance types. - skip_for_vgpu_instances + if is_vgpu; then + skip "This test does not apply to vGPU instances (g6f.*, gr6f.*)" + fi # https://docs.nvidia.com/deploy/nvml-api/group__nvmlClocksEventReasons.html#group__nvmlClocksEventReasons # The only bit allowed is nvmlClocksEventReasonGpuIdle 0x0000000000000001LL filter="egrep -v -e '(0x0000000000000000|0x0000000000000001|0x0000000000000004)'" @@ -78,7 +80,7 @@ test_nvidia_vgpu_license_status() skip "This test only applies to vGPU instances (g6f.*, gr6f.*)" fi - assert_golden $data/nvidia_vgpu_license_status.txt \ + assert_data $data/nvidia_vgpu_license_status.txt \ "nvidia-smi -q | grep 'vGPU Software' -A 2" \ "vGPU license status validation failed" } \ No newline at end of file From 5e752645c124d99080bcadc6eeea51c741bc87e9 Mon Sep 17 00:00:00 2001 From: Shiv Bhosale Date: Wed, 22 Oct 2025 06:25:48 +0000 Subject: [PATCH 3/3] feat: update dockerfile to be compatible with latest GCLOUD_SDK_URL --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 2dc2b7d32..5a6d0dcd5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,7 +29,7 @@ RUN wget -O awscli.zip https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip unzip awscli.zip && \ ./aws/install # we need gsutil from the gcloud CLI for kubetest-tester-ginkgo -RUN amazon-linux-extras install python3.8 +RUN amazon-linux-extras install python3.11 ARG GCLOUD_SDK_URL=https://dl.google.com/dl/cloudsdk/channels/rapid/google-cloud-sdk.tar.gz RUN wget -O google-cloud-sdk.tar.gz -q $GCLOUD_SDK_URL && \ tar xzf google-cloud-sdk.tar.gz -C / && \