|
FROM nvidia/cuda:10.0-base-ubuntu18.04

LABEL maintainer="Amazon AI"

# Keep apt/debconf from stopping the build to wait for user interaction.
ENV DEBIAN_FRONTEND=noninteractive \
    DEBCONF_NONINTERACTIVE_SEEN=true

# Python entry point (module:function) exported for the SageMaker training toolkit.
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main

# Python runtime behaviour and locale: no .pyc files, unbuffered stdio, UTF-8 I/O.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

# Glob matching the framework-support tarball that is COPY'd from the build
# context and pip-installed further down.
ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training*.tar.gz
# TensorFlow wheel to install (the default file name is a 2.0.1 GPU cp27 build).
ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.0.1/AmazonLinux/gpu/final/tensorflow_gpu-2.0.1-cp27-cp27mu-manylinux2010_x86_64.whl

# PYTHON and PYTHON_PIP are used as apt package names (PYTHON is also looked up
# as an executable); PIP is the pip command used for every Python install.
ARG PYTHON=python
ARG PYTHON_PIP=python-pip
ARG PIP=pip
| 22 | + |
# System packages: CUDA 10.0 development libraries, cuDNN, NCCL, build tools,
# SSH client/server and image libraries. Packages are listed alphabetically.
#  - libnccl2 / libnccl-dev: TensorFlow doesn't require libnccl anymore, but
#    Open MPI still depends on it.
#  - libgtk2.0-dev: dependent library for OpenCV.
RUN apt-get update \
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
        build-essential \
        ca-certificates \
        cuda-command-line-tools-10-0 \
        cuda-cublas-dev-10-0 \
        cuda-cudart-dev-10-0 \
        cuda-cufft-dev-10-0 \
        cuda-curand-dev-10-0 \
        cuda-cusolver-dev-10-0 \
        cuda-cusparse-dev-10-0 \
        curl \
        git \
        libcudnn7=7.5.1.10-1+cuda10.0 \
        libfreetype6-dev \
        libgomp1 \
        libgtk2.0-dev \
        libhdf5-serial-dev \
        libnccl-dev=2.4.7-1+cuda10.0 \
        libnccl2=2.4.7-1+cuda10.0 \
        libpng-dev \
        libzmq3-dev \
        openssh-client \
        openssh-server \
        vim \
        wget \
        zlib1g-dev \
    # Installing nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 adds a new
    # apt list that contains the libnvinfer library, so one more
    # 'apt-get update' is required to fetch that list before the library
    # itself can be installed.
    # libnvinfer-dev is not installed because nothing here builds against
    # TensorRT, and libnvinfer4 doesn't contain the libnvinfer.a static library.
 && apt-get update \
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
        nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \
 && apt-get update \
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
        libnvinfer5=5.0.2-1+cuda10.0 \
    # Drop the TensorRT plugin/parser libraries that came with libnvinfer5.
 && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \
       /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \
       /usr/lib/x86_64-linux-gnu/libnvparsers* \
    # Remove the apt lists in the same layer that downloaded them.
 && rm -rf /var/lib/apt/lists/* \
 && mkdir -p /var/run/sshd
| 66 | + |
# Build Open MPI 4.0.1 from the upstream release tarball and install it.
# The scratch directory is deleted in the same layer so the sources and
# object files do not end up in the image.
RUN mkdir /tmp/openmpi \
 && cd /tmp/openmpi \
 && curl -fsSLO https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
 && tar -xzf openmpi-4.0.1.tar.gz \
 && cd openmpi-4.0.1 \
 && ./configure --enable-orterun-prefix-by-default \
 && make -j "$(nproc)" all \
 && make install \
    # Refresh the dynamic linker cache so the new libraries are found.
 && ldconfig \
 && rm -rf /tmp/openmpi
| 78 | + |
# Install the Python interpreter and pip from apt (package names come from the
# PYTHON / PYTHON_PIP build args). The apt lists are removed in the same layer
# so they do not bloat the image.
# NOTE(review): --no-install-recommends is deliberately NOT added here; the
# recommended packages of python-pip may be needed by the source builds that
# follow (mpi4py, horovod) - confirm before tightening.
RUN apt-get update && apt-get install -y \
        ${PYTHON} \
        ${PYTHON_PIP} \
 && rm -rf /var/lib/apt/lists/*
| 82 | + |
# Replace mpirun with a small bash wrapper so that Open MPI is allowed to run
# as root by default: the real binary is renamed to mpirun.real and the wrapper
# forwards all arguments to it with --allow-run-as-root prepended.
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
 && printf '%s\n' \
        '#!/bin/bash' \
        'mpirun.real --allow-run-as-root "$@"' \
        > /usr/local/bin/mpirun \
 && chmod a+x /usr/local/bin/mpirun
| 88 | + |
# Give Open MPI sensible defaults through its MCA parameter file, equivalent to:
#   --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
RUN printf '%s\n' \
        "hwloc_base_binding_policy = none" \
        "rmaps_base_mapping_policy = slot" \
        >> /usr/local/etc/openmpi-mca-params.conf
| 93 | + |
# Default NCCL parameters: append the debug level to /etc/nccl.conf.
RUN printf '%s\n' 'NCCL_DEBUG=INFO' >> /etc/nccl.conf
| 96 | + |
# Search paths for Open MPI and the NVIDIA tools.
# Each PATH update is its own ENV instruction on purpose: variables set inside
# a single ENV instruction are not visible to later assignments in that same
# instruction, so merging them would drop the first prefix.
# NOTE(review): the Open MPI ./configure in this file passes no --prefix and
# later steps reference /usr/local/bin/mpirun, so the /usr/local/openmpi/*
# entries may point at directories that do not exist - confirm.
ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
ENV PATH=/usr/local/openmpi/bin/:$PATH
ENV PATH=/usr/local/nvidia/bin:$PATH
| 100 | + |
# SSH login fix: without it the user is kicked off right after logging in.
# The pam_loginuid.so session rule in the sshd PAM config is downgraded from
# "required" to "optional".
RUN mkdir -p /var/run/sshd \
 && sed -i \
        -e 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' \
        /etc/pam.d/sshd
| 104 | + |
# Generate a passphrase-less RSA key pair for root, authorise its public key
# for root logins, and disable strict host-key checking for every host in
# root's SSH client config.
RUN mkdir -p /root/.ssh/ \
 && ssh-keygen -q -N '' -t rsa -f /root/.ssh/id_rsa \
 && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
 && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
| 110 | + |
WORKDIR /

# Bring pip and setuptools up to date before installing any Python packages.
RUN ${PIP} install --no-cache-dir --upgrade pip setuptools
| 116 | + |
# Some TF tools expect a "python" binary, so expose the chosen interpreter as
# /usr/local/bin/python.
# The command substitution is quoted on purpose: if the interpreter cannot be
# found the substitution is empty, and an unquoted form would collapse to
# `ln -s /usr/local/bin/python`, silently creating a stray link in the working
# directory instead of failing the build.
RUN ln -s "$(which "${PYTHON}")" /usr/local/bin/python
| 119 | + |
# Stage the framework-support tarball(s) from the build context; they are
# installed and then deleted by the RUN below.
COPY $FRAMEWORK_SUPPORT_INSTALLABLE .

# Python dependencies. Notes on specific pins:
#   PyYAML==5.1.2           avoids a conflict with the latest awscli
#   python-dateutil==2.8.0  satisfies the botocore associated with the latest awscli
# The package order is kept as-is.
RUN ${PIP} install --no-cache-dir --upgrade \
        numpy==1.16.5 \
        scipy==1.2.2 \
        scikit-learn==0.20.4 \
        pandas==0.24.2 \
        Pillow==6.2.1 \
        h5py==2.10.0 \
        keras_applications==1.0.8 \
        keras_preprocessing==1.1.0 \
        requests==2.22.0 \
        keras==2.3.1 \
        python-dateutil==2.8.0 \
        PyYAML==5.1.2 \
        awscli \
        mpi4py==3.0.3 \
        opencv-python==4.2.0.32 \
        "cryptography>=2.3" \
        "sagemaker-tensorflow>=2.0,<2.1" \
    # TensorFlow gets its own pip invocation at the end so that the library
    # versions it needs are not overwritten by the packages above.
 && ${PIP} install --no-cache-dir --upgrade \
        ${TF_URL} \
    # Finally install the framework-support package and remove its tarball.
 && ${PIP} install --no-cache-dir --upgrade \
        $FRAMEWORK_SUPPORT_INSTALLABLE \
 && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE
| 149 | + |
# Install Horovod with NCCL allreduce and TensorFlow support.
# The CUDA stub libraries are registered with the dynamic linker only for the
# duration of the build; the trailing plain `ldconfig` rebuilds the cache
# without the stubs directory.
RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \
 && HOROVOD_GPU_ALLREDUCE=NCCL \
    HOROVOD_WITH_TENSORFLOW=1 \
    ${PIP} install --no-cache-dir horovod==0.18.2 \
 && ldconfig
| 154 | + |
# Let OpenSSH talk to other containers without asking for confirmation:
# rewrite the system-wide client config without any existing
# StrictHostKeyChecking line, append "StrictHostKeyChecking no", then swap the
# new file into place.
RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new \
 && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \
 && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
| 159 | + |
# Download the license text into the image root.
# -f makes curl exit non-zero on an HTTP error, so a missing or forbidden
# object fails the build instead of being saved as /license.txt;
# -sS hides the progress meter but still prints errors; -L follows redirects.
RUN curl -fsSL https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.0.1/license.txt -o /license.txt
| 161 | + |
# Default command: an interactive bash shell. The path is absolute so it does
# not depend on the container's working directory being "/".
CMD ["/bin/bash"]
0 commit comments