Skip to content

Commit 8a1d656

Browse files
authored
change: tensorflow-2.0 tests
* fix: tensorflow-2.0 library code changes * remove >=2.0 off tensorflow restrictions * fix: update mnist scripts for tf-2.0 * add dockerfiles
1 parent 40cf5b4 commit 8a1d656

File tree

6 files changed

+558
-19
lines changed

6 files changed

+558
-19
lines changed

docker/2.0.0/py2/Dockerfile.cpu

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
FROM ubuntu:18.04
2+
3+
LABEL maintainer="Amazon AI"
4+
5+
# prevent stopping by user interaction
6+
ENV DEBIAN_FRONTEND=noninteractive
7+
ENV DEBCONF_NONINTERACTIVE_SEEN=true
8+
9+
RUN apt-get update && apt-get install -y --no-install-recommends \
10+
software-properties-common \
11+
build-essential \
12+
openssh-client \
13+
openssh-server \
14+
ca-certificates \
15+
curl \
16+
git \
17+
wget \
18+
vim \
19+
zlib1g-dev \
20+
&& rm -rf /var/lib/apt/lists/*
21+
22+
# Install Open MPI
23+
RUN mkdir /tmp/openmpi && \
24+
cd /tmp/openmpi && \
25+
curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz && \
26+
tar zxf openmpi-4.0.1.tar.gz && \
27+
cd openmpi-4.0.1 && \
28+
./configure --enable-orterun-prefix-by-default && \
29+
make -j $(nproc) all && \
30+
make install && \
31+
ldconfig && \
32+
rm -rf /tmp/openmpi
33+
34+
# Create a wrapper for OpenMPI to allow running as root by default
35+
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
36+
echo '#!/bin/bash' > /usr/local/bin/mpirun && \
37+
echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
38+
chmod a+x /usr/local/bin/mpirun
39+
40+
RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \
41+
echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
42+
43+
ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
44+
45+
# Put /usr/local/openmpi/bin/ ahead of the existing PATH entries.
ENV PATH=/usr/local/openmpi/bin/:$PATH
46+
47+
# SSH login fix. Otherwise user is kicked off after login
48+
RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
49+
50+
# Create SSH key.
51+
RUN mkdir -p /root/.ssh/ && \
52+
mkdir -p /var/run/sshd && \
53+
ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \
54+
cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \
55+
printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
56+
57+
# Set environment variables for MKL
58+
# For more about MKL with TensorFlow see:
59+
# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
60+
ENV KMP_AFFINITY=granularity=fine,compact,1,0 KMP_BLOCKTIME=1 KMP_SETTINGS=0
61+
62+
WORKDIR /
63+
64+
ARG PYTHON=python
65+
ARG PYTHON_PIP=python-pip
66+
ARG PIP=pip
67+
68+
# Install the Python interpreter and pip packages selected by the PYTHON and
# PYTHON_PIP build args, then drop the apt package lists in the same layer so
# they are not baked into the image.
# NOTE(review): recommended packages are intentionally still installed, as in
# the original; later pip source builds may rely on them, so verify before
# adding --no-install-recommends here.
RUN apt-get update && apt-get install -y \
    ${PYTHON} \
    ${PYTHON_PIP} \
 && rm -rf /var/lib/apt/lists/*
71+
72+
ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8
73+
74+
ARG framework_support_installable=sagemaker_tensorflow_container-2.0.8.dev0.tar.gz
75+
ARG sagemaker_tensorflow_extensions=tensorflow-2.0.0-cp27-cp27mu-manylinux2010_x86_64.whl
76+
COPY $framework_support_installable .
77+
COPY $sagemaker_tensorflow_extensions .
78+
79+
RUN ${PIP} --no-cache-dir install --upgrade pip setuptools
80+
81+
# Some TF tools expect a "python" binary
82+
RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
83+
84+
RUN ${PIP} install --no-cache-dir -U \
85+
numpy==1.16.4 \
86+
scipy==1.2.2 \
87+
scikit-learn==0.20.3 \
88+
pandas==0.24.2 \
89+
Pillow==6.1.0 \
90+
h5py==2.9.0 \
91+
keras_applications==1.0.8 \
92+
keras_preprocessing==1.1.0 \
93+
requests==2.22.0 \
94+
keras==2.2.4 \
95+
awscli==1.16.196 \
96+
mpi4py==3.0.2 \
97+
$sagemaker_tensorflow_extensions \
98+
# Let's install TensorFlow separately in the end to avoid
99+
# the library version to be overwritten
100+
# && ${PIP} install --force-reinstall --no-cache-dir -U ${TF_URL} \
101+
&& ${PIP} install --no-cache-dir -U $framework_support_installable && \
102+
rm -f $framework_support_installable \
103+
&& ${PIP} install --no-cache-dir -U horovod==0.18.2 \
104+
&& ${PIP} uninstall -y --no-cache-dir \
105+
markdown
106+
107+
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
108+
109+
# Default to a bash shell. The path is absolute so the command does not depend
# on the working directory being "/".
CMD ["/bin/bash"]

docker/2.0.0/py2/Dockerfile.gpu

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
FROM nvidia/cuda:10.0-base-ubuntu18.04
2+
3+
LABEL maintainer="Amazon AI"
4+
5+
# prevent stopping by user interaction
6+
ENV DEBIAN_FRONTEND=noninteractive
7+
ENV DEBCONF_NONINTERACTIVE_SEEN=true
8+
9+
RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
10+
ca-certificates \
11+
cuda-command-line-tools-10-0 \
12+
cuda-cublas-dev-10-0 \
13+
cuda-cudart-dev-10-0 \
14+
cuda-cufft-dev-10-0 \
15+
cuda-curand-dev-10-0 \
16+
cuda-cusolver-dev-10-0 \
17+
cuda-cusparse-dev-10-0 \
18+
curl \
19+
libcudnn7=7.5.1.10-1+cuda10.0 \
20+
# TensorFlow doesn't require libnccl anymore but Open MPI still depends on it
21+
libnccl2=2.4.7-1+cuda10.0 \
22+
libgomp1 \
23+
libnccl-dev=2.4.7-1+cuda10.0 \
24+
libfreetype6-dev \
25+
libhdf5-serial-dev \
26+
libpng-dev \
27+
libzmq3-dev \
28+
git \
29+
wget \
30+
vim \
31+
build-essential \
32+
openssh-client \
33+
openssh-server \
34+
zlib1g-dev && \
35+
# The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0
36+
# adds a new list which contains libnvinfer library, so it needs another
37+
# 'apt-get update' to retrieve that list before it can actually install the
38+
# library.
39+
# We don't install libnvinfer-dev since we don't need to build against TensorRT,
40+
# and libnvinfer5 doesn't contain libnvinfer.a static library.
41+
apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
42+
nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 && \
43+
apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
44+
libnvinfer5=5.0.2-1+cuda10.0 && \
45+
rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* && \
46+
rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* && \
47+
rm /usr/lib/x86_64-linux-gnu/libnvparsers* && \
48+
rm -rf /var/lib/apt/lists/* && \
49+
mkdir -p /var/run/sshd
50+
51+
# Install Open MPI
52+
RUN mkdir /tmp/openmpi && \
53+
cd /tmp/openmpi && \
54+
curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz && \
55+
tar zxf openmpi-4.0.1.tar.gz && \
56+
cd openmpi-4.0.1 && \
57+
./configure --enable-orterun-prefix-by-default && \
58+
make -j $(nproc) all && \
59+
make install && \
60+
ldconfig && \
61+
rm -rf /tmp/openmpi
62+
63+
ARG PYTHON=python
64+
ARG PYTHON_PIP=python-pip
65+
ARG PIP=pip
66+
67+
# Install the Python interpreter and pip packages selected by the PYTHON and
# PYTHON_PIP build args, then drop the apt package lists in the same layer so
# they are not baked into the image.
# NOTE(review): recommended packages are intentionally still installed, as in
# the original; later pip source builds may rely on them, so verify before
# adding --no-install-recommends here.
RUN apt-get update && apt-get install -y \
    ${PYTHON} \
    ${PYTHON_PIP} \
 && rm -rf /var/lib/apt/lists/*
70+
71+
# Create a wrapper for OpenMPI to allow running as root by default
72+
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
73+
echo '#!/bin/bash' > /usr/local/bin/mpirun && \
74+
echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
75+
chmod a+x /usr/local/bin/mpirun
76+
77+
# Configure OpenMPI to run good defaults:
78+
# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
79+
RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \
80+
echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
81+
82+
# Set default NCCL parameters
83+
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf
84+
85+
ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
86+
# Put /usr/local/openmpi/bin/ ahead of the existing PATH entries.
ENV PATH=/usr/local/openmpi/bin/:$PATH
87+
ENV PATH=/usr/local/nvidia/bin:$PATH
88+
89+
# SSH login fix. Otherwise user is kicked off after login
90+
RUN mkdir -p /var/run/sshd && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
91+
92+
# Create SSH key.
93+
RUN mkdir -p /root/.ssh/ && \
94+
ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \
95+
cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \
96+
printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
97+
98+
###########################################################################
99+
# Python won’t try to write .pyc or .pyo files on the import of source modules
100+
ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8
101+
102+
WORKDIR /
103+
104+
RUN ${PIP} --no-cache-dir install --upgrade pip setuptools
105+
106+
# Some TF tools expect a "python" binary
107+
RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
108+
109+
ARG framework_support_installable=sagemaker_tensorflow_container-2.0.8.dev0.tar.gz
110+
ARG sagemaker_tensorflow_extensions=tensorflow_gpu-2.0.0-cp27-cp27mu-manylinux2010_x86_64.whl
111+
COPY $framework_support_installable .
112+
COPY $sagemaker_tensorflow_extensions .
113+
114+
RUN ${PIP} install --no-cache-dir -U \
115+
numpy==1.16.4 \
116+
scipy==1.2.2 \
117+
scikit-learn==0.20.3 \
118+
pandas==0.24.2 \
119+
Pillow==6.1.0 \
120+
h5py==2.9.0 \
121+
keras_applications==1.0.8 \
122+
keras_preprocessing==1.1.0 \
123+
requests==2.22.0 \
124+
keras==2.2.4 \
125+
awscli==1.16.196 \
126+
mpi4py==3.0.2 \
127+
$sagemaker_tensorflow_extensions \
128+
# Let's install TensorFlow separately in the end to avoid
129+
# the library version to be overwritten
130+
# && ${PIP} install --force-reinstall --no-cache-dir -U ${TF_URL} \
131+
&& ${PIP} install --no-cache-dir -U $framework_support_installable && \
132+
rm -f $framework_support_installable \
133+
&& ${PIP} uninstall -y --no-cache-dir \
134+
markdown
135+
136+
# Install Horovod, temporarily using CUDA stubs
137+
RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs && \
138+
HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 ${PIP} install --no-cache-dir horovod==0.18.2 && \
139+
ldconfig
140+
141+
# Allow OpenSSH to talk to containers without asking for confirmation
142+
# Rewrite the system-wide SSH client config: filter out every line that
# mentions StrictHostKeyChecking, append an explicit "no", then swap the new
# file into place.
RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
145+
146+
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
147+
148+
# Default to a bash shell. The path is absolute so the command does not depend
# on the working directory being "/".
CMD ["/bin/bash"]

docker/2.0.0/py3/Dockerfile.cpu

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
FROM ubuntu:18.04
2+
3+
LABEL maintainer="Amazon AI"
4+
5+
# prevent stopping by user interaction
6+
ENV DEBIAN_FRONTEND=noninteractive
7+
ENV DEBCONF_NONINTERACTIVE_SEEN=true
8+
9+
RUN apt-get update && apt-get install -y --no-install-recommends \
10+
software-properties-common \
11+
build-essential \
12+
openssh-client \
13+
openssh-server \
14+
ca-certificates \
15+
curl \
16+
git \
17+
wget \
18+
vim \
19+
zlib1g-dev \
20+
&& rm -rf /var/lib/apt/lists/*
21+
22+
# Install Open MPI
23+
RUN mkdir /tmp/openmpi && \
24+
cd /tmp/openmpi && \
25+
curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz && \
26+
tar zxf openmpi-4.0.1.tar.gz && \
27+
cd openmpi-4.0.1 && \
28+
./configure --enable-orterun-prefix-by-default && \
29+
make -j $(nproc) all && \
30+
make install && \
31+
ldconfig && \
32+
rm -rf /tmp/openmpi
33+
34+
# Create a wrapper for OpenMPI to allow running as root by default
35+
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
36+
echo '#!/bin/bash' > /usr/local/bin/mpirun && \
37+
echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
38+
chmod a+x /usr/local/bin/mpirun
39+
40+
RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \
41+
echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
42+
43+
ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
44+
45+
# Put /usr/local/openmpi/bin/ ahead of the existing PATH entries.
ENV PATH=/usr/local/openmpi/bin/:$PATH
46+
47+
# SSH login fix. Otherwise user is kicked off after login
48+
RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
49+
50+
# Create SSH key.
51+
RUN mkdir -p /root/.ssh/ && \
52+
mkdir -p /var/run/sshd && \
53+
ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \
54+
cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \
55+
printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
56+
57+
# Set environment variables for MKL
58+
# For more about MKL with TensorFlow see:
59+
# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
60+
ENV KMP_AFFINITY=granularity=fine,compact,1,0 KMP_BLOCKTIME=1 KMP_SETTINGS=0
61+
62+
WORKDIR /
63+
64+
ARG PYTHON=python3
65+
ARG PYTHON_PIP=python3-pip
66+
ARG PIP=pip3
67+
ARG PYTHON_VERSION=3.6.6
68+
69+
# Build CPython $PYTHON_VERSION from source and install it under /usr/local.
#
# The development headers for the optional extension modules are installed
# BEFORE ./configure runs, so one configure/make/make install pass sees them.
# (Previously Python was built and installed once without the headers, then
# built and installed a second time after installing them.)
# The apt package lists and the extracted source tree/tarball are removed in
# this same layer so they do not end up in the image.
# Finally /usr/bin/pip is pointed at the pip3 installed by "make install".
RUN apt-get update && apt-get install -y --no-install-recommends \
        libbz2-dev \
        libc6-dev \
        libgdbm-dev \
        libncursesw5-dev \
        libreadline-gplv2-dev \
        libsqlite3-dev \
        libssl-dev \
        tk-dev && \
    rm -rf /var/lib/apt/lists/* && \
    wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz && \
    tar -xf Python-$PYTHON_VERSION.tgz && \
    cd Python-$PYTHON_VERSION && \
    ./configure && \
    make -j "$(nproc)" && \
    make install && \
    cd .. && \
    rm -rf Python-$PYTHON_VERSION* && \
    ln -s /usr/local/bin/pip3 /usr/bin/pip
75+
76+
ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8
77+
78+
ARG framework_support_installable=sagemaker_tensorflow_container-2.0.8.dev0.tar.gz
79+
ARG sagemaker_tensorflow_extensions=tensorflow-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl
80+
COPY $framework_support_installable .
81+
COPY $sagemaker_tensorflow_extensions .
82+
83+
# ARG TF_URL="https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.14/AmazonLinux/cpu/final/tensorflow-1.14.0-cp36-cp36m-linux_x86_64.whl"
84+
85+
RUN ${PIP} --no-cache-dir install --upgrade pip setuptools
86+
87+
# Some TF tools expect a "python" binary
88+
RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
89+
90+
RUN ${PIP} install --no-cache-dir -U \
91+
numpy==1.16.4 \
92+
scipy==1.2.2 \
93+
scikit-learn==0.20.3 \
94+
pandas==0.24.2 \
95+
Pillow==6.1.0 \
96+
h5py==2.9.0 \
97+
keras_applications==1.0.8 \
98+
keras_preprocessing==1.1.0 \
99+
keras==2.2.4 \
100+
requests==2.22.0 \
101+
awscli==1.16.196 \
102+
mpi4py==3.0.2 \
103+
$sagemaker_tensorflow_extensions && \
104+
# "sagemaker-tensorflow>=1.14,<1.15" && \
105+
# Let's install TensorFlow separately in the end to avoid
106+
# the library version to be overwritten
107+
${PIP} install --force-reinstall --no-cache-dir -U \
108+
# ${TF_URL} \
109+
horovod==0.18.2 && \
110+
${PIP} install --no-cache-dir -U $framework_support_installable && \
111+
rm -f $framework_support_installable && \
112+
${PIP} uninstall -y --no-cache-dir \
113+
markdown
114+
115+
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
116+
117+
# Default to a bash shell. The path is absolute so the command does not depend
# on the working directory being "/".
CMD ["/bin/bash"]

0 commit comments

Comments
 (0)