Skip to content

Commit f560339

Browse files
authored
update: update r1.15.2 dockerfiles (#283)
* update for r1.15.2 * empty commit to restart CodeBuild tests * rename the folder * put 1.15.0 folder back * update sagemaker package name * add * to sagemaker package name
1 parent 59b2611 commit f560339

File tree

4 files changed

+564
-0
lines changed

4 files changed

+564
-0
lines changed

docker/1.15.2/py2/Dockerfile.cpu

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
FROM ubuntu:18.04
2+
3+
LABEL maintainer="Amazon AI"
4+
5+
# Prevent the docker build from being stopped by requests for user interaction
6+
ENV DEBIAN_FRONTEND=noninteractive
7+
ENV DEBCONF_NONINTERACTIVE_SEEN=true
8+
# Set environment variables for MKL
9+
# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
10+
ENV KMP_AFFINITY=granularity=fine,compact,1,0
11+
ENV KMP_BLOCKTIME=1
12+
ENV KMP_SETTINGS=0
13+
# Python won’t try to write .pyc or .pyo files on the import of source modules
14+
ENV PYTHONDONTWRITEBYTECODE=1
15+
ENV PYTHONUNBUFFERED=1
16+
# See http://bugs.python.org/issue19846
17+
ENV PYTHONIOENCODING=UTF-8
18+
ENV LANG=C.UTF-8
19+
ENV LC_ALL=C.UTF-8
20+
# Specify the location of module that contains the training logic for SageMaker
21+
# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html
22+
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
23+
24+
# Define framework-related package sources
25+
ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training*.tar.gz
26+
ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15.2/AmazonLinux/cpu/final/tensorflow-1.15.2-cp27-cp27mu-manylinux2010_x86_64.whl
27+
28+
RUN apt-get update \
29+
&& apt-get install -y --no-install-recommends \
30+
software-properties-common \
31+
build-essential \
32+
openssh-client \
33+
openssh-server \
34+
ca-certificates \
35+
curl \
36+
git \
37+
wget \
38+
vim \
39+
zlib1g-dev \
40+
&& rm -rf /var/lib/apt/lists/*
41+
42+
# Install Open MPI
43+
RUN mkdir /tmp/openmpi \
44+
&& cd /tmp/openmpi \
45+
&& curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
46+
&& tar zxf openmpi-4.0.1.tar.gz \
47+
&& cd openmpi-4.0.1 \
48+
&& ./configure --enable-orterun-prefix-by-default \
49+
&& make -j $(nproc) all \
50+
&& make install \
51+
&& ldconfig \
52+
&& rm -rf /tmp/openmpi
53+
54+
# Create a wrapper for OpenMPI to allow running as root by default
55+
# Replace mpirun with a small bash wrapper that always passes --allow-run-as-root
# to the real binary (renamed to mpirun.real). The wrapper uses `exec` so that
# mpirun.real replaces the wrapper shell instead of running as its child, and
# therefore receives signals directly.
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
 && echo '#!/bin/bash' > /usr/local/bin/mpirun \
 && echo 'exec mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
 && chmod a+x /usr/local/bin/mpirun
59+
60+
RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
61+
&& echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
62+
63+
# Append the inherited value only when it is set. With an unset LD_LIBRARY_PATH the
# plain ":$LD_LIBRARY_PATH" form leaves a trailing ':' (an empty entry, which the
# dynamic loader interprets as the current working directory).
ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
64+
ENV PATH=/usr/local/openmpi/bin/:$PATH
65+
66+
# SSH login fix. Otherwise user is kicked off after login
67+
RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
68+
69+
# Create SSH key.
70+
RUN mkdir -p /root/.ssh/ \
71+
&& mkdir -p /var/run/sshd \
72+
&& ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
73+
&& cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
74+
&& printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
75+
76+
WORKDIR /
77+
78+
# Install Python 2 and pip from the distribution packages. The apt package lists
# are removed in the same layer so they do not persist in the image.
# NOTE(review): --no-install-recommends is intentionally not added here; the
# recommended packages of python-pip may be needed by later pip source builds
# (e.g. mpi4py, horovod) - confirm before changing.
RUN apt-get update \
 && apt-get install -y \
    python \
    python-pip \
 && rm -rf /var/lib/apt/lists/*
82+
83+
COPY $FRAMEWORK_SUPPORT_INSTALLABLE .
84+
85+
RUN pip --no-cache-dir install --upgrade \
86+
pip \
87+
setuptools
88+
89+
# Some TF tools expect a "python" binary
90+
RUN ln -s $(which python) /usr/local/bin/python
91+
92+
RUN pip install --no-cache-dir -U \
93+
numpy==1.16.5 \
94+
scipy==1.2.2 \
95+
scikit-learn==0.20.3 \
96+
pandas==0.24.2 \
97+
Pillow==6.2.1 \
98+
h5py==2.9.0 \
99+
keras_applications==1.0.8 \
100+
keras_preprocessing==1.1.0 \
101+
requests==2.22.0 \
102+
keras==2.3.1 \
103+
mpi4py==3.0.2 \
104+
"cryptography>=2.3" \
105+
"sagemaker-tensorflow>=1.15,<1.16" \
106+
# Install TensorFlow separately, at the end, to avoid the library version being overwritten
107+
&& pip install --force-reinstall --no-cache-dir -U \
108+
${TF_URL} \
109+
&& pip install --no-cache-dir -U \
110+
$FRAMEWORK_SUPPORT_INSTALLABLE \
111+
awscli\
112+
&& rm -f $FRAMEWORK_SUPPORT_INSTALLABLE \
113+
&& pip install --no-cache-dir -U \
114+
horovod==0.18.2
115+
116+
# Download the license file. -f makes curl exit non-zero on an HTTP error instead of
# saving the error page as /license.txt; -sS hides progress output but still prints
# errors; -L follows redirects.
RUN curl -fSsL https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt
117+
118+
# Default to an interactive shell. The absolute path does not depend on the
# container's working directory.
CMD ["/bin/bash"]

docker/1.15.2/py2/Dockerfile.gpu

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
# Nvidia does not publish a TensorRT Runtime library for Ubuntu 18.04 with Cuda 10.1 support, so we stick with cuda 10.0.
2+
# https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/
3+
FROM nvidia/cuda:10.0-base-ubuntu18.04
4+
5+
LABEL maintainer="Amazon AI"
6+
7+
# Prevent the docker build from being stopped by requests for user interaction
8+
ENV DEBIAN_FRONTEND=noninteractive
9+
ENV DEBCONF_NONINTERACTIVE_SEEN=true
10+
# Python won’t try to write .pyc or .pyo files on the import of source modules
11+
ENV PYTHONDONTWRITEBYTECODE=1
12+
ENV PYTHONUNBUFFERED=1
13+
# See http://bugs.python.org/issue19846
14+
ENV PYTHONIOENCODING=UTF-8
15+
ENV LANG=C.UTF-8
16+
ENV LC_ALL=C.UTF-8
17+
# Specify the location of module that contains the training logic for SageMaker
18+
# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html
19+
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
20+
21+
# Define framework-related package sources
22+
ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training*.tar.gz
23+
ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15.2/AmazonLinux/gpu/final/tensorflow_gpu-1.15.2-cp27-cp27mu-manylinux2010_x86_64.whl
24+
25+
RUN apt-get update \
26+
&& apt-get install -y --no-install-recommends --allow-unauthenticated \
27+
ca-certificates \
28+
cuda-command-line-tools-10-0 \
29+
cuda-cublas-dev-10-0 \
30+
cuda-cudart-dev-10-0 \
31+
cuda-cufft-dev-10-0 \
32+
cuda-curand-dev-10-0 \
33+
cuda-cusolver-dev-10-0 \
34+
cuda-cusparse-dev-10-0 \
35+
curl \
36+
libcudnn7=7.5.1.10-1+cuda10.0 \
37+
# TensorFlow doesn't require libnccl anymore but Open MPI still depends on it
38+
libnccl2=2.4.7-1+cuda10.0 \
39+
libgomp1 \
40+
libnccl-dev=2.4.7-1+cuda10.0 \
41+
libfreetype6-dev \
42+
libhdf5-serial-dev \
43+
libpng-dev \
44+
libzmq3-dev \
45+
git \
46+
wget \
47+
vim \
48+
build-essential \
49+
openssh-client \
50+
openssh-server \
51+
zlib1g-dev \
52+
# The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0
53+
# adds a new list which contains libnvinfer library, so it needs another
54+
# 'apt-get update' to retrieve that list before it can actually install the library.
55+
# We don't install libnvinfer-dev since we don't need to build against TensorRT,
56+
# and libnvinfer4 doesn't contain libnvinfer.a static library.
57+
&& apt-get update \
58+
&& apt-get install -y --no-install-recommends --allow-unauthenticated \
59+
nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \
60+
&& apt-get update \
61+
&& apt-get install -y --no-install-recommends --allow-unauthenticated \
62+
libnvinfer5=5.0.2-1+cuda10.0 \
63+
&& rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \
64+
&& rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \
65+
&& rm /usr/lib/x86_64-linux-gnu/libnvparsers* \
66+
&& rm -rf /var/lib/apt/lists/* \
67+
&& mkdir -p /var/run/sshd
68+
69+
# Install Open MPI
70+
RUN mkdir /tmp/openmpi \
71+
&& cd /tmp/openmpi \
72+
&& curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
73+
&& tar zxf openmpi-4.0.1.tar.gz \
74+
&& cd openmpi-4.0.1 \
75+
&& ./configure --enable-orterun-prefix-by-default \
76+
&& make -j $(nproc) all \
77+
&& make install \
78+
&& ldconfig \
79+
&& rm -rf /tmp/openmpi
80+
81+
# Install Python 2 and pip from the distribution packages. The apt package lists
# are removed in the same layer so they do not persist in the image.
# NOTE(review): --no-install-recommends is intentionally not added here; the
# recommended packages of python-pip may be needed by later pip source builds
# (e.g. mpi4py, horovod) - confirm before changing.
RUN apt-get update \
 && apt-get install -y \
    python \
    python-pip \
 && rm -rf /var/lib/apt/lists/*
85+
86+
# Create a wrapper for OpenMPI to allow running as root by default
87+
# Replace mpirun with a small bash wrapper that always passes --allow-run-as-root
# to the real binary (renamed to mpirun.real). The wrapper uses `exec` so that
# mpirun.real replaces the wrapper shell instead of running as its child, and
# therefore receives signals directly.
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
 && echo '#!/bin/bash' > /usr/local/bin/mpirun \
 && echo 'exec mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
 && chmod a+x /usr/local/bin/mpirun
91+
92+
# Configure OpenMPI to run with good defaults:
93+
# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
94+
RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
95+
&& echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
96+
97+
# Set default NCCL parameters
98+
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf
99+
100+
ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
101+
# Prepend the Open MPI bin directory to PATH (key=value form, matching the other
# ENV instructions in this file; the space-separated form is legacy syntax).
ENV PATH=/usr/local/openmpi/bin/:$PATH
102+
ENV PATH=/usr/local/nvidia/bin:$PATH
103+
104+
# SSH login fix. Otherwise user is kicked off after login
105+
RUN mkdir -p /var/run/sshd \
106+
&& sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
107+
108+
# Create SSH key.
109+
RUN mkdir -p /root/.ssh/ \
110+
&& ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
111+
&& cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
112+
&& printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
113+
114+
WORKDIR /
115+
116+
RUN pip --no-cache-dir install --upgrade \
117+
pip \
118+
setuptools
119+
120+
# Some TF tools expect a "python" binary
121+
RUN ln -s $(which python) /usr/local/bin/python
122+
123+
COPY $FRAMEWORK_SUPPORT_INSTALLABLE .
124+
125+
RUN pip install --no-cache-dir -U \
126+
numpy==1.16.5 \
127+
scipy==1.2.2 \
128+
scikit-learn==0.20.3 \
129+
pandas==0.24.2 \
130+
Pillow==6.2.1 \
131+
h5py==2.9.0 \
132+
keras_applications==1.0.8 \
133+
keras_preprocessing==1.1.0 \
134+
requests==2.22.0 \
135+
keras==2.3.1 \
136+
mpi4py==3.0.2 \
137+
"cryptography>=2.3" \
138+
"sagemaker-tensorflow>=1.15,<1.16" \
139+
# Install TensorFlow separately, at the end, to avoid the library version being overwritten
140+
&& pip install --force-reinstall --no-cache-dir -U \
141+
${TF_URL} \
142+
&& pip install --no-cache-dir -U \
143+
$FRAMEWORK_SUPPORT_INSTALLABLE \
144+
awscli\
145+
&& rm -f $FRAMEWORK_SUPPORT_INSTALLABLE
146+
147+
# Install Horovod, temporarily using CUDA stubs
148+
RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \
149+
&& HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \
150+
horovod==0.18.2 \
151+
&& ldconfig
152+
153+
# Allow OpenSSH to talk to containers without asking for confirmation
154+
# Rewrite the system-wide SSH client config: drop every existing line mentioning
# StrictHostKeyChecking, append a single "StrictHostKeyChecking no" entry, then
# swap the rebuilt file into place.
RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new \
 && echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \
 && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
157+
158+
# Download the license file. -f makes curl exit non-zero on an HTTP error instead of
# saving the error page as /license.txt; -sS hides progress output but still prints
# errors; -L follows redirects.
RUN curl -fSsL https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt
159+
160+
# Default to an interactive shell. The absolute path does not depend on the
# container's working directory.
CMD ["/bin/bash"]

0 commit comments

Comments
 (0)