|
# CUDA 10.1 "base" flavour of the NVIDIA image on Ubuntu 18.04.
FROM nvidia/cuda:10.1-base-ubuntu18.04

LABEL maintainer="Amazon AI"

# Stop debconf from prompting for user input during package installation, and
# record the module path exposed through SAGEMAKER_TRAINING_MODULE.
ENV DEBIAN_FRONTEND=noninteractive \
    DEBCONF_NONINTERACTIVE_SEEN=true \
    SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main

# Python runtime switches (no .pyc files, unbuffered stdio, UTF-8 I/O) and a
# UTF-8 locale.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8
| 15 | + |
# Interpreter selection: apt package names for Python and pip, and the pip
# executable used by the RUN steps.
ARG PYTHON=python
ARG PYTHON_PIP=python-pip
ARG PIP=pip

# Artifacts installed with pip: the TensorFlow GPU wheel (a cp27 build) and the
# training-toolkit tarball, matched by glob in the build context.
ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.1/AmazonLinux/gpu/final/tensorflow_gpu-2.1.0-cp27-cp27mu-manylinux2010_x86_64.whl
ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training*.tar.gz
| 22 | + |
# Install OS-level dependencies in one layer: CUDA 10.1 libraries, cuDNN, NCCL,
# cuBLAS, the TensorRT runtime, build tools and an SSH client/server. The apt
# package index is deleted at the end of the same layer.
RUN apt-get update \
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
    build-essential \
    ca-certificates \
    cuda-command-line-tools-10-1 \
    cuda-cudart-dev-10-1 \
    cuda-cufft-dev-10-1 \
    cuda-curand-dev-10-1 \
    cuda-cusolver-dev-10-1 \
    cuda-cusparse-dev-10-1 \
    curl \
    git \
    libcudnn7=7.6.2.24-1+cuda10.1 \
    libfreetype6-dev \
    libgomp1 \
    # libgtk2.0-dev is a dependent library for OpenCV
    libgtk2.0-dev \
    libhdf5-serial-dev \
    # TensorFlow no longer requires libnccl, but Open MPI still depends on it
    libnccl-dev=2.4.7-1+cuda10.1 \
    libnccl2=2.4.7-1+cuda10.1 \
    libpng-dev \
    libzmq3-dev \
    openssh-client \
    openssh-server \
    vim \
    wget \
    zlib1g-dev \
 # cuda-cublas-dev is not available for 10-1, so libcublas is installed
 # instead. The pinned version moves cuBLAS from 10-2 down to 10-1, which is
 # why this install passes --allow-downgrades.
 && apt-get update \
 && apt-get install -y --no-install-recommends --allow-unauthenticated --allow-downgrades \
    libcublas-dev=10.1.0.105-1 \
    libcublas10=10.1.0.105-1 \
 # Installing nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 registers
 # an extra apt list that carries the libnvinfer packages, so another
 # 'apt-get update' is needed before the library itself can be installed.
 # libnvinfer-dev is left out because nothing here builds against TensorRT.
 # nvinfer-runtime-trt-repo has no 1804-cuda10.1 build yet; see
 # https://developer.download.nvidia.cn/compute/machine-learning/repos/ubuntu1804/x86_64/
 && apt-get update \
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
    nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \
 && apt-get update \
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
    libnvinfer6=6.0.1-1+cuda10.1 \
 && rm -rf /var/lib/apt/lists/* \
 && mkdir -p /var/run/sshd
| 71 | + |
# Build Open MPI 4.0.1 from the upstream release tarball, install it, refresh
# the linker cache and remove the build tree in the same layer.
# configure is given no --prefix, so the default install prefix applies.
RUN mkdir /tmp/openmpi \
 && cd /tmp/openmpi \
 && curl --fail --show-error --silent --location --remote-name \
      https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
 && tar -xzf openmpi-4.0.1.tar.gz \
 && cd openmpi-4.0.1 \
 && ./configure --enable-orterun-prefix-by-default \
 && make -j "$(nproc)" all \
 && make install \
 && ldconfig \
 && rm -rf /tmp/openmpi
| 83 | + |
# Install the Python interpreter and pip named by the PYTHON / PYTHON_PIP build
# arguments. The apt package index is removed in the same layer so that it does
# not end up stored in the image.
# NOTE(review): --no-install-recommends is intentionally NOT added here. The
# later pip steps compile native extensions and presumably rely on packages
# that arrive as recommendations of python-pip (e.g. Python headers) — confirm
# before tightening this install.
RUN apt-get update \
 && apt-get install -y \
    ${PYTHON} \
    ${PYTHON_PIP} \
 && rm -rf /var/lib/apt/lists/*
| 87 | + |
# Replace mpirun with a wrapper script that always passes --allow-run-as-root,
# keeping the original binary as mpirun.real.
# The wrapper uses 'exec' so the bash process is replaced by mpirun.real:
# signals sent to the wrapper reach Open MPI directly and its exit status is
# returned unchanged.
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
 && printf '%s\n' \
      '#!/bin/bash' \
      'exec mpirun.real --allow-run-as-root "$@"' \
      > /usr/local/bin/mpirun \
 && chmod a+x /usr/local/bin/mpirun
| 93 | + |
# Default Open MPI MCA parameters, the config-file counterpart of running with
#   --bind-to none --map-by slot
# Only these two settings are written; no btl_tcp_if_exclude entry is added.
RUN printf '%s\n' \
      'hwloc_base_binding_policy = none' \
      'rmaps_base_mapping_policy = slot' \
      >> /usr/local/etc/openmpi-mca-params.conf
| 98 | + |
# Default NCCL settings: append NCCL_DEBUG=INFO to /etc/nccl.conf.
RUN printf '%s\n' 'NCCL_DEBUG=INFO' >> /etc/nccl.conf
| 101 | + |
# Prepend the Open MPI library directory to LD_LIBRARY_PATH, and the NVIDIA and
# Open MPI bin directories (in that order) to PATH.
ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH \
    PATH=/usr/local/nvidia/bin:/usr/local/openmpi/bin/:$PATH
| 105 | + |
# SSH login fix: without it the user is kicked off right after logging in.
# Ensures the sshd runtime directory exists and downgrades pam_loginuid from
# "required" to "optional" in the sshd PAM configuration.
RUN mkdir -p /var/run/sshd \
 && sed -i -e 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd
| 109 | + |
# Generate a passphrase-less RSA key pair for root, authorise its public key
# for login, and disable strict host key checking for every host in root's
# SSH client config.
# NOTE(review): the key pair is created at build time, so the private key is
# stored in the image and shared by every container started from it.
RUN mkdir -p /root/.ssh/ \
 && ssh-keygen -t rsa -q -f /root/.ssh/id_rsa -N '' \
 && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
 && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
| 115 | + |
WORKDIR /

# Upgrade pip and setuptools before any other Python package is installed.
RUN ${PIP} install --no-cache-dir --upgrade \
    pip \
    setuptools
| 121 | + |
# Some TF tools expect a "python" binary, so link the selected interpreter to
# /usr/local/bin/python.
# The command substitution is quoted on purpose: if the interpreter cannot be
# found, ln receives an empty target and the build fails, instead of silently
# creating a stray link in the working directory.
RUN ln -s "$(which "${PYTHON}")" /usr/local/bin/python
| 124 | + |
# Copy the training-toolkit tarball(s) matched by the build-arg glob into the
# current WORKDIR. The destination ends with "/" because a wildcard source may
# match several files, and the Dockerfile reference then requires a directory
# destination with a trailing slash.
COPY ${FRAMEWORK_SUPPORT_INSTALLABLE} ./
| 126 | + |
# Python dependencies for the training image.
# PyYAML is pinned to avoid a conflict with the latest awscli, and
# python-dateutil is pinned to satisfy the botocore that comes with it.
RUN ${PIP} install --no-cache-dir --upgrade \
    numpy==1.16.6 \
    scipy==1.2.2 \
    scikit-learn==0.20.4 \
    pandas==0.24.2 \
    Pillow==6.2.2 \
    h5py==2.10.0 \
    keras_applications==1.0.8 \
    keras_preprocessing==1.1.0 \
    keras==2.3.1 \
    python-dateutil==2.8.1 \
    pyYAML==5.2 \
    requests==2.22.0 \
    awscli \
    mpi4py==3.0.3 \
    opencv-python==4.2.0.32 \
    "cryptography>=2.3" \
    "sagemaker-tensorflow>=2.0,<2.1" \
 # TensorFlow is installed on its own, after everything above, so that the
 # library versions it brings are not overwritten.
 && ${PIP} install --no-cache-dir --upgrade \
    ${TF_URL} \
 # Finally install the training-toolkit tarball and delete it.
 && ${PIP} install --no-cache-dir --upgrade \
    ${FRAMEWORK_SUPPORT_INSTALLABLE} \
 && rm -f ${FRAMEWORK_SUPPORT_INSTALLABLE}
| 154 | + |
# Install Horovod 0.18.2 with NCCL allreduce and TensorFlow support.
# The CUDA stub libraries are added to the linker cache only for the duration
# of the build; the trailing ldconfig rebuilds the cache without them.
RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \
 && HOROVOD_GPU_ALLREDUCE=NCCL \
    HOROVOD_WITH_TENSORFLOW=1 \
    ${PIP} install --no-cache-dir horovod==0.18.2 \
 && ldconfig
| 159 | + |
# Let OpenSSH connect to other containers without asking for confirmation:
# drop every existing StrictHostKeyChecking line from the system-wide client
# config, append "StrictHostKeyChecking no", and swap the new file into place.
RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new \
 && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \
 && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
| 164 | + |
# Download the license text into the image root.
# --fail makes the build stop on an HTTP error instead of saving the server's
# error page as /license.txt; the remaining flags match the other curl call in
# this file (quiet, but still print errors, and follow redirects).
RUN curl -fSsL https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.1/license.txt -o /license.txt
| 166 | + |
# Default to an interactive shell. The path is absolute so the command still
# resolves when the container is started with a working directory other
# than "/".
CMD ["/bin/bash"]
0 commit comments