Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 74 additions & 73 deletions e2e2/test/images/nvidia/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
# Version components used to assemble the base-image tag in the FROM line below.
ARG UBUNTU_MAJOR_VERSION=22

ARG CUDA_MAJOR_VERSION=12
ARG CUDA_MINOR_VERSION=5

# Start with the NVIDIA CUDA base image
# NOTE(review): a hard-coded FROM and a parameterized FROM appear back to back.
# Each FROM starts a new build stage, so only the second one feeds the steps
# below. Presumably the first is a leftover; confirm and remove.
FROM nvidia/cuda:12.5.0-devel-ubuntu22.04
FROM nvidia/cuda:${CUDA_MAJOR_VERSION}.${CUDA_MINOR_VERSION}.1-devel-ubuntu${UBUNTU_MAJOR_VERSION}.04

# Versions of the components installed on top of the base image.
ARG EFA_INSTALLER_VERSION=latest
# 1.7.4+ is required, to enforce proper EFA function with OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0
ARG AWS_OFI_NCCL_VERSION=1.9.1
ARG NCCL_TESTS_VERSION=master
# ARGs declared before FROM are only visible to FROM instructions; re-declaring
# them here (without defaults) brings the values set above into this stage.
ARG UBUNTU_MAJOR_VERSION
ARG CUDA_MAJOR_VERSION
ARG CUDA_MINOR_VERSION

# Keep apt/dpkg from prompting for input during the build.
ENV DEBIAN_FRONTEND=noninteractive

# Install necessary dependencies
RUN apt-get update -y
RUN apt-get remove -y --allow-change-held-packages \
RUN apt update -y \
&& apt remove -y --allow-change-held-packages \
libmlx5-1 \
ibverbs-utils \
libibverbs-dev \
Expand All @@ -17,84 +23,79 @@ RUN apt-get remove -y --allow-change-held-packages \
libnccl-dev

# Delete the HPC-X, MPI and UCX directories and the HPC-X linker config, then
# refresh the dynamic linker cache so it no longer references them.
# The removal chain appeared twice (the second copy dangling after the RUN had
# already ended); it is written once here.
RUN rm -rf \
        /opt/hpcx \
        /usr/local/mpi \
        /usr/local/ucx \
    && rm -f /etc/ld.so.conf.d/hpcx.conf \
    && ldconfig

# Install build tooling, SSH client/server, debugging utilities, DCGM and the
# CUDA 12.5 demo suite in one layer.
# NOTE(review): --allow-unauthenticated lets apt install packages whose
# signatures cannot be verified; confirm it is still required.
# NOTE(review): a second, shorter `apt install` step with an overlapping package
# list follows directly below; presumably only one of the two is intended.
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
sudo \
git \
gcc \
vim \
kmod \
openssh-client \
openssh-server \
build-essential \
wget curl \
autoconf \
libtool \
gdb \
automake \
python3-distutils \
cmake \
apt-utils \
devscripts \
debhelper \
libsubunit-dev \
check \
pkg-config \
libhwloc-dev \
datacenter-gpu-manager \
cloud-utils \
cuda-demo-suite-12-5
# Install the compiler toolchain, autotools/cmake, SSH client/server, curl, git,
# hwloc headers and the CUDA demo suite matching CUDA_MAJOR/MINOR_VERSION.
# `apt-get update` runs in the same layer as the install so a stale, cached
# package index from an earlier layer is never used, and apt-get replaces apt
# because apt's command-line interface is not intended for scripted use.
RUN apt-get update -y \
    && apt-get install -y \
        apt-utils \
        autoconf \
        automake \
        build-essential \
        cmake \
        cuda-demo-suite-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
        curl \
        gcc \
        git \
        libhwloc-dev \
        libtool \
        openssh-client \
        openssh-server

# Create the SSH daemon's runtime directory and relax SSH checks:
#   - client (ssh_config): StrictHostKeyChecking set to "no" and known hosts
#     written to /dev/null
#   - server (sshd_config): StrictModes set to "no"
# This setup appeared twice; because the `echo >>` append is not idempotent it
# is applied exactly once here.
RUN mkdir -p /var/run/sshd \
    && sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config \
    && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
    && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

# Library search path (key=value form; the space-separated ENV form is legacy).
# NOTE(review): LD_LIBRARY_PATH is set again right after this block with a
# different path list that prepends to this value; confirm both are wanted.
ENV LD_LIBRARY_PATH=/opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH

# Put the Open MPI, EFA, aws-ofi-nccl and CUDA library/binary directories ahead
# of whatever the base image already has on LD_LIBRARY_PATH and PATH.
# Written in key=value form; the space-separated `ENV key value` form is legacy.
ENV LD_LIBRARY_PATH=/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH \
    PATH=/usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH

# Install EFA
# Downloads the installer tarball into $HOME, unpacks and runs it, then deletes
# the unpacked aws-efa-installer directory.
# NOTE(review): the downloaded .tar.gz itself is never removed from $HOME.
# NOTE(review): another EFA install step (working in /tmp, with --mpi openmpi5)
# follows directly below; presumably only one of the two should remain.
RUN cd $HOME \
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
&& tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
&& cd aws-efa-installer \
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
&& rm -rf $HOME/aws-efa-installer
# EFA installer version; it is substituted into the tarball name
# aws-efa-installer-<version>.tar.gz.
ARG EFA_INSTALLER_VERSION=latest
# Stream the installer tarball into /tmp, run it, then remove the /tmp contents
# and the apt package lists in the same layer.
# `cd` is used instead of WORKDIR so the working directory seen by later
# instructions is left unchanged.
# curl -f makes an HTTP error abort the download instead of piping an error
# page into tar; -S still prints the error although -s silences progress.
RUN cd /tmp \
    && curl -fsSL https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz | tar xvz \
    && cd aws-efa-installer \
    && ./efa_installer.sh --yes --enable-gdr --skip-kmod --skip-limit-conf --no-verify --mpi openmpi5 \
    && rm -rf \
        /tmp/* \
        /var/lib/apt/lists/*

# Install NCCL
# Deletes apt key 7fa2af80, installs the cuda-keyring package, then installs
# pinned libnccl2/libnccl-dev builds.
# NOTE(review): the keyring is fetched from the ubuntu1804 repo path although the
# base image tag is ubuntu22.04, and the pinned NCCL build targets cuda12.2
# while the base image is CUDA 12.5; confirm these are intentional.
# NOTE(review): `apt install` is run through sudo and without -y.
# NOTE(review): a newer NCCL install step follows directly below; presumably
# this one is superseded.
RUN apt-key del 7fa2af80 \
&& curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \
&& dpkg -i cuda-keyring_1.0-1_all.deb \
&& sudo apt install libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2
# NCCL package version, pinned to the build that matches the image's CUDA
# major.minor version.
ARG NCCL_VERSION=2.22.3-1+cuda${CUDA_MAJOR_VERSION}.${CUDA_MINOR_VERSION}
# Install the NCCL runtime and headers at the pinned version.
# apt-get replaces apt (apt's CLI is not meant for scripts), and the package
# lists fetched by `update` are removed in the same layer so they do not add to
# the image size.
RUN apt-get update \
    && apt-get install -y \
        libnccl-dev=${NCCL_VERSION} \
        libnccl2=${NCCL_VERSION} \
    && rm -rf /var/lib/apt/lists/*

## Install AWS-OFI-NCCL plugin
# Clones the plugin into /opt/aws-ofi-nccl, checks out tag
# v${AWS_OFI_NCCL_VERSION}-aws, and builds/installs it under
# /opt/aws-ofi-nccl/install against the EFA libfabric, CUDA and Open MPI trees.
# OPAL_PREFIX is exported as an empty string for the duration of this RUN.
# NOTE(review): a tarball-based build of the same plugin into the same prefix
# follows further below; presumably this git-based step is superseded.
RUN export OPAL_PREFIX="" \
&& git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
&& cd /opt/aws-ofi-nccl \
&& git checkout v${AWS_OFI_NCCL_VERSION}-aws \
&& ./autogen.sh \
&& ./configure --prefix=/opt/aws-ofi-nccl/install \
--with-libfabric=/opt/amazon/efa/ \
--with-cuda=/usr/local/cuda \
--with-mpi=/opt/amazon/openmpi/ \
&& make && make install

# Install NCCL Tests
# Clones nccl-tests into /opt/nccl-tests, checks out ${NCCL_TESTS_VERSION} and
# builds it in place with MPI enabled.
# NOTE(review): a tarball-based nccl-tests build that also populates
# /opt/nccl-tests follows further below; presumably this step is superseded.
RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
&& cd /opt/nccl-tests \
&& git checkout ${NCCL_TESTS_VERSION} \
&& make MPI=1 \
MPI_HOME=/opt/amazon/openmpi/ \
CUDA_HOME=/usr/local/cuda
# Install AWS-OFI-NCCL plugin
# Release version; it is used both in the release tag (v<version>) and in the
# tarball/directory name (aws-ofi-nccl-<version>).
ARG AWS_OFI_NCCL_VERSION=1.11.0-aws
# Download the release tarball, build it against the EFA libfabric, CUDA and
# Open MPI trees, and install it under /opt/aws-ofi-nccl/install.
#   - `cd /tmp` is absolute: the previous relative `cd tmp` only worked while
#     the working directory happened to be /.
#   - curl -f aborts on an HTTP error instead of piping an error page into tar.
#   - the unpacked source tree is removed in the same layer that created it.
RUN cd /tmp \
    && curl -fsSL https://github.com/aws/aws-ofi-nccl/releases/download/v${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}.tar.gz | tar xvz \
    && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} \
    && ./configure --prefix=/opt/aws-ofi-nccl/install \
        --with-mpi=/opt/amazon/openmpi \
        --with-libfabric=/opt/amazon/efa \
        --with-cuda=/usr/local/cuda \
        --enable-platform-aws \
        --disable-tests \
    && make -j $(nproc) \
    && make install \
    && cd /tmp \
    && rm -rf /tmp/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}

# Install NCCL Tests
# nccl-tests release tag (without the leading "v").
ARG NCCL_TESTS_VERSION=2.13.10
# Download the tagged source archive, build it with MPI support, copy only the
# build output to /opt/nccl-tests/build, then empty /tmp in the same layer.
# curl -f aborts on an HTTP error instead of piping an error page into tar.
# NOTE(review): MPI_HOME points at /opt/amazon/openmpi5/ while PATH,
# LD_LIBRARY_PATH and the plugin's --with-mpi use /opt/amazon/openmpi; confirm
# which prefix the EFA installer actually populates.
RUN cd /tmp \
    && curl -fsSL https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v${NCCL_TESTS_VERSION}.tar.gz | tar xvz \
    && cd nccl-tests-${NCCL_TESTS_VERSION} \
    && make MPI=1 \
        MPI_HOME=/opt/amazon/openmpi5/ \
        CUDA_HOME=/usr/local/cuda \
    && mkdir -p /opt/nccl-tests \
    && cp -r build /opt/nccl-tests/build \
    && rm -rf /tmp/*

# Runtime environment: set NCCL_PROTO to "simple" and preload the system
# libnccl shared object. (No default command is set here.)
# ENV uses key=value form; the space-separated form is legacy.
ENV NCCL_PROTO=simple
RUN rm -rf /var/lib/apt/lists/*
ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so:$LD_PRELOAD

# Copy the GPU unit tests into the image (destination is relative to the
# current working directory) and make the test runner executable. The chmod
# was listed twice; it is idempotent, so one invocation is kept.
COPY e2e2/test/images/nvidia/gpu_unit_tests ./gpu_unit_tests
RUN chmod +x ./gpu_unit_tests/unit_test
Loading