Update gpu flavor to cuda 11.2

lukasmasuch · lukasmasuch · commit d7ea5913ea38 · 2021-06-28T17:32:53.000Z
diff --git a/gpu-flavor/Dockerfile b/gpu-flavor/Dockerfile
@@ -8,26 +8,27 @@ ENV WORKSPACE_FLAVOR=$ARG_WORKSPACE_FLAVOR
 USER root
 
 ### NVIDIA CUDA BASE ###
-# https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/10.1/ubuntu18.04-x86_64/base/Dockerfile
-RUN apt-get update && apt-get install -y --no-install-recommends gnupg2 curl ca-certificates && \
-    curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \
-    echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
-    echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \
+# https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/11.2.2/ubuntu20.04-x86_64/base/Dockerfile
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gnupg2 curl ca-certificates && \
+    curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub | apt-key add - && \
+    echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
+    echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \
     # Cleanup - cannot use cleanup script here, otherwise too much is removed
     apt-get clean && \
     rm -rf $HOME/.cache/* && \
     rm -rf /tmp/* && \
     rm -rf /var/lib/apt/lists/*
 
-ENV CUDA_VERSION 11.2.1
-ENV CUDA_PKG_VERSION 11-2=$CUDA_VERSION-1
-ENV CUDART_VERSION 11-2=$CUDA_VERSION46-1
+ENV CUDA_VERSION 11.2.2
+#ENV CUDA_PKG_VERSION 11-2=$CUDA_VERSION-1
+#ENV CUDART_VERSION 11-2=$CUDA_VERSION46-1
 
 # For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a
 RUN apt-get update && apt-get install -y --no-install-recommends \
-        cuda-cudart-$CUDART_VERSION \
-        cuda-compat-11-2 && \
-    ln -s cuda-11.2 /usr/local/cuda && \
+    cuda-cudart-11-2=11.2.152-1 \
+    cuda-compat-11-2 \
+    && ln -s cuda-11.2 /usr/local/cuda && \
     rm -rf /var/lib/apt/lists/* && \
     # Cleanup - cannot use cleanup script here, otherwise too much is removed
     apt-get clean && \
@@ -36,111 +37,101 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     rm -rf /var/lib/apt/lists/*
 
 # Required for nvidia-docker v1
-RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
-    echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf
+RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
+    && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf
 
 ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
-ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
+ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
 
 # nvidia-container-runtime
 # https://github.com/NVIDIA/nvidia-container-runtime#environment-variables-oci-spec
 # nvidia-container-runtime
 ENV NVIDIA_VISIBLE_DEVICES all
 ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
-ENV NVIDIA_REQUIRE_CUDA "cuda>=10.1 brand=tesla,driver>=396,driver<397 brand=tesla,driver>=410,driver<411 brand=tesla,driver>=418,driver<419"
+ENV NVIDIA_REQUIRE_CUDA "cuda>=11.2 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441 driver>=450"
 
 ### CUDA RUNTIME ###
-# https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/10.1/ubuntu18.04-x86_64/runtime/Dockerfile
+# https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/11.2.2/ubuntu20.04-x86_64/runtime/Dockerfile
 
 ENV NCCL_VERSION 2.8.4
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
-        cuda-libraries-$CUDA_PKG_VERSION \
-        libnpp-11-2=11.3.2.139-1 \
-        cuda-nvtx-11-2=11.2.67-1 \
-        libcublas-11-2=11.4.1.1026-1 \
-        libcusparse-11-2=11.4.0.135-1 \
-        libnccl2=$NCCL_VERSION-1+cuda11.2 \
-    && apt-mark hold libnccl2 \
+    cuda-libraries-11-2=11.2.2-1 \
+    libnpp-11-2=11.3.2.152-1 \
+    cuda-nvtx-11-2=11.2.152-1 \
+    libcublas-11-2=11.4.1.1043-1 \
+    libcusparse-11-2=11.4.1.1152-1 \
+    libnccl2=$NCCL_VERSION-1+cuda11.2 \
+    && rm -rf /var/lib/apt/lists/* \
     # Cleanup - cannot use cleanup script here, otherwise too much is removed
     && apt-get clean \
     && rm -rf $HOME/.cache/* \
     && rm -rf /tmp/* \
     && rm -rf /var/lib/apt/lists/*
 
-# apt from auto upgrading the cublas package. See https://gitlab.com/nvidia/container-images/cuda/-/issues/88
-RUN apt-mark hold libcublas10
+RUN apt-mark hold libcublas-11-2 libnccl2
 
 ### END CUDA RUNTIME ###
 
 ### CUDA DEVEL ###
-# https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/10.1/ubuntu18.04-x86_64/devel/Dockerfile
+# https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/11.2.2/ubuntu20.04-x86_64/devel/Dockerfile
 RUN apt-get update && apt-get install -y --no-install-recommends \
-        libtinfo5 libncursesw5 \
-        cuda-cudart-dev-$CUDART_VERSION \
-        cuda-nvml-dev-11-2=11.2.67-1 \
-        cuda-command-line-tools-$CUDA_PKG_VERSION \
-        libnpp-dev-11-2=11.3.2.139-1 \
-        cuda-libraries-dev-$CUDA_PKG_VERSION \
-        cuda-minimal-build-$CUDA_PKG_VERSION \
-        libcublas-dev-11-2=11.4.1.1026-1 \
-        libcusparse-dev-11-2=11.4.0.135-1 \
-        libnpp-dev-11-2=11.3.2.139-1 \
-        libnccl-dev=$NCCL_VERSION-1+cuda11.2 && \
-    apt-mark hold libnccl-dev && \
+    libtinfo5 libncursesw5 \
+    cuda-cudart-dev-11-2=11.2.152-1 \
+    cuda-command-line-tools-11-2=11.2.2-1 \
+    cuda-minimal-build-11-2=11.2.2-1 \
+    cuda-libraries-dev-11-2=11.2.2-1 \
+    cuda-nvml-dev-11-2=11.2.152-1 \
+    libnpp-dev-11-2=11.3.2.152-1 \
+    libnccl-dev=2.8.4-1+cuda11.2 \
+    libcublas-dev-11-2=11.4.1.1043-1 \
+    libcusparse-dev-11-2=11.4.1.1152-1 && \
     # Cleanup - cannot use cleanup script here, otherwise too much is removed
     apt-get clean && \
     rm -rf $HOME/.cache/* && \
     rm -rf /tmp/* && \
     rm -rf /var/lib/apt/lists/*
 
 # apt from auto upgrading the cublas package. See https://gitlab.com/nvidia/container-images/cuda/-/issues/88
-RUN apt-mark hold libcublas-dev
-
+RUN apt-mark hold libcublas-dev-11-2 libnccl-dev
 ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs
 
+
 ### END CUDA DEVEL ###
 
-### CUDANN7 DEVEL ###
-# https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/10.1/ubuntu18.04-x86_64/devel/cudnn7/Dockerfile
+### CUDNN8 DEVEL ###
+# https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/11.2.2/ubuntu20.04-x86_64/devel/cudnn8/Dockerfile
 
-ENV CUDNN_VERSION 8.1.0.77
+ENV CUDNN_VERSION 8.1.1.33
 LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}"
 
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-            libcudnn7=$CUDNN_VERSION-1+cuda11.2 \
-            libcudnn7-dev=$CUDNN_VERSION-1+cuda11.2 && \
-    apt-mark hold libcudnn8 && \
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libcudnn8=$CUDNN_VERSION-1+cuda11.2 \
+    libcudnn8-dev=$CUDNN_VERSION-1+cuda11.2 \
+    && apt-mark hold libcudnn8 && \
     # Cleanup
     apt-get clean && \
     rm -rf /root/.cache/* && \
     rm -rf /tmp/* && \
     rm -rf /var/lib/apt/lists/*
 
-### END CUDANN7 ###
+### END CUDNN8 ###
 
 # Link Cupti:
 ENV LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:/usr/local/cuda/extras/CUPTI/lib64
 
-# Install TensorRT. Requires that libcudnn8 is installed above.
-# https://www.tensorflow.org/install/gpu#ubuntu_1804_cuda_101
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        libnvinfer7=7.1.3-1+cuda11.0 \
-        libnvinfer-dev=7.1.3-1+cuda11.0 \
-        libnvinfer-plugin7=7.1.3-1+cuda11.0 && \
-    # Cleanup
-    clean-layer.sh
-
 ### GPU DATA SCIENCE LIBRARIES ###
 
 RUN \
     apt-get update && \
     apt-get install -y libomp-dev libopenblas-base && \
-    # Not needed? Install cuda-toolkit (e.g. for pytorch: https://pytorch.org/): https://anaconda.org/anaconda/cudatoolkit
-    conda install -y cudatoolkit=11.0.221 -c pytorch && \
+    # Install pytorch gpu
+    # uninstall cpu only packages via conda
+    conda remove --force -y pytorch cpuonly && \
+    # https://pytorch.org/get-started/locally/
+    conda install -y pytorch cudatoolkit=11.2 -c pytorch -c nvidia && \
     # Install cupy: https://cupy.chainer.org/
-    pip install --no-cache-dir cupy-cuda112 && \
+    pip install --no-cache-dir cupy-cuda112 && \
     # Install pycuda: https://pypi.org/project/pycuda
     pip install --no-cache-dir pycuda && \
     # Install gpu utils libs
@@ -149,25 +140,19 @@ RUN \
     pip install --no-cache-dir scikit-cuda && \
     # Install tensorflow gpu
     pip uninstall -y tensorflow tensorflow-cpu intel-tensorflow && \
-    # TODO: tensorflow 2.3.1 installs tenorboard 2.4.0 with problems, use 2.3.0
-    pip install --no-cache-dir tensorflow-gpu==2.4.1 && \
+    pip install --no-cache-dir tensorflow-gpu==2.5.0 && \
     # Install ONNX GPU Runtime
-    # TODO: 1.4.x is latest with cuda 10.1 support
     pip uninstall -y onnxruntime && \
-    pip install --no-cache-dir onnxruntime-gpu==1.7.0 && \
-    # Install pytorch gpu
-    # uninstall cpu only packages via conda
-    conda remove --force -y pytorch cpuonly && \
-    # https://pytorch.org/get-started/locally/
-    conda install -y pytorch -c pytorch && \
+    pip install --no-cache-dir onnxruntime-gpu==1.8.0 onnxruntime-training==1.8.0 && \
     # Install faiss gpu
     conda remove --force -y faiss-cpu && \
     conda install -y faiss-gpu -c pytorch && \
     # Update mxnet to gpu edition
     pip uninstall -y mxnet-mkl && \
-    pip install --no-cache-dir mxnet-cu101mkl==1.6.0.post0 && \
+    # NOTE(review): the cu110 wheel targets CUDA 11.0 - confirm it works with the CUDA 11.2 runtime installed above
+    pip install --no-cache-dir mxnet-cu110 && \
     # install jax: https://github.com/google/jax#pip-installation
-    pip install --upgrade jax jaxlib==0.1.62+cuda110 -f https://storage.googleapis.com/jax-releases/jax_releases.html  && \
+    pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html && \
     # Install pygpu - Required for theano: http://deeplearning.net/software/libgpuarray/
     conda install -y pygpu && \
     # Install lightgbm
@@ -182,19 +167,6 @@ RUN \
     # Cleanup
     clean-layer.sh
 
-# TODO: nvdashboard does not work with relative paths
-# RUN \
-#     # Install Jupyterlab GPU Plugin: https://github.com/rapidsai/jupyterlab-nvdashboard
-#     pip install jupyterlab-nvdashboard && \
-#     jupyter labextension install jupyterlab-nvdashboard && \
-#     # Clean jupyter lab cache: https://github.com/jupyterlab/jupyterlab/issues/4930
-#     jupyter lab clean && \
-#     jlpm cache clean && \
-#     # Remove build folder -> should be remove by lab clean as well?
-#     rm -rf $CONDA_ROOT/share/jupyter/lab/staging && \
-#     # Cleanup
-#     clean-layer.sh
-
 # TODO install DALI: https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html#dali-and-ngc
 # TODO: if > Ubuntu 19.04 -> install nvtop: https://github.com/Syllo/nvtop
 # TODO: Install Arrrayfire: https://arrayfire.com/download/ pip install --no-cache-dir arrayfire && \