Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions e2e2/test/cases/nvidia/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,15 @@ import (
)

// Package-level settings shared by the nvidia test cases. The pointer-typed
// entries nvidiaTestImage, efaEnabled, installDevicePlugin and
// skipUnitTestSubcommand are assigned from command-line flags in TestMain.
var (
testenv env.Environment
nodeType *string
installDevicePlugin *bool
efaEnabled *bool
nvidiaTestImage *string
nodeCount int
gpuPerNode int
efaPerNode int
testenv env.Environment
nodeType *string
installDevicePlugin *bool
efaEnabled *bool
nvidiaTestImage *string
skipUnitTestSubcommand *string
nodeCount int
gpuPerNode int
efaPerNode int
)

var (
Expand Down Expand Up @@ -134,6 +135,7 @@ func TestMain(m *testing.M) {
nvidiaTestImage = flag.String("nvidiaTestImage", "", "nccl test image for nccl tests")
efaEnabled = flag.Bool("efaEnabled", false, "enable efa tests")
installDevicePlugin = flag.Bool("installDevicePlugin", true, "install nvidia device plugin")
skipUnitTestSubcommand = flag.String("skipUnitTestSubcommand", "", "optional command to skip specified unit test, `-s test1|test2|...`")
cfg, err := envconf.NewFromFlags()
if err != nil {
log.Fatalf("failed to initialize test environment: %v", err)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ spec:
command:
- /bin/bash
- ./gpu_unit_tests/unit_test
env:
# Read by gpu_unit_tests/unit_test and appended to the bash_unit command line.
# The template field must match SkipTestSubcommand in unitTestManifestTplVars.
- name: SKIP_TESTS_SUBCOMMAND
value: {{.SkipTestSubcommand}}
imagePullPolicy: Always
resources:
limits:
Expand Down
8 changes: 6 additions & 2 deletions e2e2/test/cases/nvidia/unit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ var (
)

// unitTestManifestTplVars carries the values substituted into
// jobUnitTestSingleNodeManifest when TestSingleNodeUnitTest renders it via
// fwext.RenderManifests.
type unitTestManifestTplVars struct {
NvidiaTestImage string
NvidiaTestImage string
SkipTestSubcommand string
GpuPerNode int
}

type hpcTestManifestTplVars struct {
Expand All @@ -42,7 +44,9 @@ func TestSingleNodeUnitTest(t *testing.T) {
}
var err error
renderedJobUnitTestSingleNodeManifest, err = fwext.RenderManifests(jobUnitTestSingleNodeManifest, unitTestManifestTplVars{
NvidiaTestImage: *nvidiaTestImage,
NvidiaTestImage: *nvidiaTestImage,
SkipTestSubcommand: *skipUnitTestSubcommand,
GpuPerNode: gpuPerNode,
})
if err != nil {
t.Fatal(err)
Expand Down
20 changes: 10 additions & 10 deletions e2e2/test/images/nvidia/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ ARG CUDA_MAJOR_VERSION=12
ARG CUDA_MINOR_VERSION=5

# Start with the NVIDIA CUDA base image
FROM nvidia/cuda:${CUDA_MAJOR_VERSION}.${CUDA_MINOR_VERSION}.1-devel-ubuntu${UBUNTU_MAJOR_VERSION}.04
FROM nvidia/cuda:$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION.1-devel-ubuntu$UBUNTU_MAJOR_VERSION.04

ARG UBUNTU_MAJOR_VERSION
ARG CUDA_MAJOR_VERSION
Expand Down Expand Up @@ -41,7 +41,7 @@ RUN apt install -y \
cmake \
apt-utils \
libhwloc-dev \
cuda-demo-suite-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
cuda-demo-suite-$CUDA_MAJOR_VERSION-$CUDA_MINOR_VERSION \
datacenter-gpu-manager

RUN mkdir -p /var/run/sshd \
Expand All @@ -55,24 +55,24 @@ ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sb
# Install EFA
ARG EFA_INSTALLER_VERSION=latest
RUN cd /tmp \
&& curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz | tar xvz \
&& curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-$EFA_INSTALLER_VERSION.tar.gz | tar xvz \
&& cd aws-efa-installer \
&& ./efa_installer.sh --yes --enable-gdr --skip-kmod --skip-limit-conf --no-verify --mpi openmpi5 \
&& rm -rf /tmp/* \
/var/lib/apt/lists/*

# Install NCCL
ARG NCCL_VERSION=2.22.3-1+cuda${CUDA_MAJOR_VERSION}.${CUDA_MINOR_VERSION}
ARG NCCL_VERSION=2.22.3-1+cuda12.5
RUN apt update \
&& apt install -y \
libnccl2=${NCCL_VERSION} \
libnccl-dev=${NCCL_VERSION}
libnccl2=$NCCL_VERSION \
libnccl-dev=$NCCL_VERSION

# Install AWS-OFI-NCCL plugin
ARG AWS_OFI_NCCL_VERSION=1.11.0-aws
RUN cd tmp \
&& curl -sL https://github.com/aws/aws-ofi-nccl/releases/download/v${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}.tar.gz | tar xvz \
&& cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} \
&& curl -sL https://github.com/aws/aws-ofi-nccl/releases/download/v$AWS_OFI_NCCL_VERSION/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION.tar.gz | tar xvz \
&& cd aws-ofi-nccl-$AWS_OFI_NCCL_VERSION \
&& ./configure --prefix=/opt/aws-ofi-nccl/install \
--with-mpi=/opt/amazon/openmpi \
--with-libfabric=/opt/amazon/efa \
Expand All @@ -85,8 +85,8 @@ RUN cd tmp \
# Install NCCL Tests
ARG NCCL_TESTS_VERSION=2.13.10
RUN cd /tmp \
&& curl -sL https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v${NCCL_TESTS_VERSION}.tar.gz | tar xvz \
&& cd nccl-tests-${NCCL_TESTS_VERSION} \
&& curl -sL https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v$NCCL_TESTS_VERSION.tar.gz | tar xvz \
&& cd nccl-tests-$NCCL_TESTS_VERSION \
&& make MPI=1 \
MPI_HOME=/opt/amazon/openmpi5/ \
CUDA_HOME=/usr/local/cuda \
Expand Down
13 changes: 11 additions & 2 deletions e2e2/test/images/nvidia/gpu_unit_tests/bash_unit
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,9 @@ run_tests() {
declare -F | "$GREP" ' setup$' >/dev/null && setup
__bash_unit_test_skipped__=$(mktemp)
trap "$RM -f \"$stdout\" \"$stderr\"" EXIT
if [[ -n "$skip_pattern" && ("$test" =~ $skip_pattern) ]]; then
skip "$test as specified in skip pattern: $skip_pattern"
fi
(__bash_unit_current_test__="$test" run_test) || status=$?
test -s $__bash_unit_test_skipped__ && status=0
declare -F | "$GREP" ' teardown$' >/dev/null && teardown
Expand All @@ -311,9 +314,10 @@ run_teardown_suite() {

usage() {
echo "$1" >&2
echo "$0 [-f <output format>] [-p <pattern1>] [-p <pattern2>] [-r] ... <test_file1> <test_file2> ..." >&2
echo "$0 [-f <output format>] [-p <pattern1>] [-p <pattern2>] [-s <skip_pattern>] [-r] ... <test_file1> <test_file2> ..." >&2
echo >&2
echo "Runs tests in test files that match <pattern>s" >&2
echo "Skip tests in test files that match <skip_pattern>s" >&2
echo "<output format> is optional only supported value is tap" >&2
echo "-r to execute test cases in random order" >&2
echo "-v to get current version information" >&2
Expand Down Expand Up @@ -533,16 +537,21 @@ tap_format() {

output_format=text
test_pattern=""
skip_pattern=""
trace_file=""
separator=""
randomise=0
while getopts "vp:t:f:r" option
# -p, -t, -f and -s each take an argument; -v and -r are plain flags.
while getopts "vp:t:f:rs:" option
do
case "$option" in
p)
test_pattern="${test_pattern}${separator}${OPTARG}"
separator="|"
;;
s)
# Join repeated -s values with "|" using skip_pattern's own state rather
# than the separator shared with -p; otherwise mixing -p and -s leaves a
# leading "|" (an empty alternative) in whichever pattern comes second.
skip_pattern="${skip_pattern}${skip_pattern:+|}${OPTARG}"
;;
t)
trace_file="$(realpath ${OPTARG})"
truncate -s0 "$trace_file"
Expand Down
3 changes: 2 additions & 1 deletion e2e2/test/images/nvidia/gpu_unit_tests/unit_test
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ TRACE_LOG=trace.log
TEST_TIMEOUT=1800
BASH="/usr/bin/bash"
CURRENT_DIR=$(pwd)
SKIP_TESTS_SUBCOMMAND=${SKIP_TESTS_SUBCOMMAND:-""}

timeout -k 10 ${TEST_TIMEOUT} ${BASH} gpu_unit_tests/bash_unit -f tap -t gpu_unit_tests/${TRACE_LOG} gpu_unit_tests/tests/*test*.sh
timeout -k 10 ${TEST_TIMEOUT} ${BASH} gpu_unit_tests/bash_unit -f tap ${SKIP_TESTS_SUBCOMMAND} -t gpu_unit_tests/${TRACE_LOG} gpu_unit_tests/tests/*test*.sh
Loading