Skip to content

Commit fbc0b1e

Browse files
sboshin, chuyang-deng
authored and committed
Upgrading Dockerfile packages, and increasing throttling from 0 to 1 second (#258)
* Upgrading awscli, pinning PyYAML and python-dateutil, keeping markdown
* Line ending change
* Increasing throttling from 0 to 1, to alleviate eventual consistency with S3
1 parent bea33dd commit fbc0b1e

File tree

5 files changed

+38
-25
lines changed

5 files changed

+38
-25
lines changed

docker/2.0.0/py2/Dockerfile.cpu

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ RUN ${PIP} --no-cache-dir install --upgrade \
9090
# Some TF tools expect a "python" binary
9191
RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
9292

93+
# install PyYAML==5.1.2 to avoid conflict with latest awscli
94+
# python-dateutil==2.8.0 to satisfy botocore associated with latest awscli
9395
RUN ${PIP} install --no-cache-dir -U \
9496
numpy==1.16.5 \
9597
scipy==1.2.2 \
@@ -101,20 +103,22 @@ RUN ${PIP} install --no-cache-dir -U \
101103
keras_preprocessing==1.1.0 \
102104
requests==2.22.0 \
103105
keras==2.3.1 \
104-
awscli \
106+
python-dateutil==2.8.0 \
107+
PyYAML==5.1.2 \
108+
awscli==1.16.303 \
105109
mpi4py==3.0.2 \
110+
"cryptography>=2.3" \
111+
"sagemaker-tensorflow>=2.0,<2.1" \
106112
# Let's install TensorFlow separately in the end to avoid
107113
# the library version to be overwritten
108-
# ${PIP} install --force-reinstall --no-cache-dir -U ${TF_URL} \
109-
&& ${PIP} install --force-reinstall --no-cache-dir -U \
114+
# ${PIP} install --no-cache-dir -U ${TF_URL} \
115+
&& ${PIP} install --no-cache-dir -U \
110116
$TENSORFLOW_WHL \
111117
&& ${PIP} install --no-cache-dir -U \
112118
$FRAMEWORK_SUPPORT_INSTALLABLE \
113119
&& rm -f $FRAMEWORK_SUPPORT_INSTALLABLE \
114120
&& ${PIP} install --no-cache-dir -U \
115-
horovod==0.18.2 \
116-
&& ${PIP} uninstall -y --no-cache-dir \
117-
markdown
121+
horovod==0.18.2
118122

119123
COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
120124
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py

docker/2.0.0/py2/Dockerfile.gpu

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@ RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
118118
COPY $FRAMEWORK_SUPPORT_INSTALLABLE .
119119
COPY $TENSORFLOW_WHL .
120120

121+
# install PyYAML==5.1.2 to avoid conflict with latest awscli
122+
# python-dateutil==2.8.0 to satisfy botocore associated with latest awscli
121123
RUN ${PIP} install --no-cache-dir -U \
122124
numpy==1.16.5 \
123125
scipy==1.2.2 \
@@ -129,18 +131,20 @@ RUN ${PIP} install --no-cache-dir -U \
129131
keras_preprocessing==1.1.0 \
130132
requests==2.22.0 \
131133
keras==2.3.1 \
132-
awscli \
134+
python-dateutil==2.8.0 \
135+
PyYAML==5.1.2 \
136+
awscli==1.16.303 \
133137
mpi4py==3.0.2 \
138+
"cryptography>=2.3" \
139+
"sagemaker-tensorflow>=2.0,<2.1" \
134140
# Let's install TensorFlow separately in the end to avoid
135141
# the library version to be overwritten
136-
# ${PIP} install --force-reinstall --no-cache-dir -U ${TF_URL} \
137-
&& ${PIP} install --force-reinstall --no-cache-dir -U \
142+
# ${PIP} install --no-cache-dir -U ${TF_URL} \
143+
&& ${PIP} install --no-cache-dir -U \
138144
$TENSORFLOW_WHL \
139145
&& ${PIP} install --no-cache-dir -U \
140146
$FRAMEWORK_SUPPORT_INSTALLABLE \
141-
&& rm -f $FRAMEWORK_SUPPORT_INSTALLABLE \
142-
&& ${PIP} uninstall -y --no-cache-dir \
143-
markdown
147+
&& rm -f $FRAMEWORK_SUPPORT_INSTALLABLE
144148

145149
# Install Horovod, temporarily using CUDA stubs
146150
RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \

docker/2.0.0/py3/Dockerfile.cpu

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ RUN ${PIP} --no-cache-dir install --upgrade \
9393
RUN ln -s $(which ${PYTHON}) /usr/local/bin/python \
9494
&& ln -s $(which ${PIP}) /usr/bin/pip
9595

96+
# install PyYAML==5.1.2 to avoid conflict with latest awscli
97+
# python-dateutil==2.8.0 to satisfy botocore associated with latest awscli
9698
RUN ${PIP} install --no-cache-dir -U \
9799
numpy==1.17.4 \
98100
scipy==1.2.2 \
@@ -103,20 +105,21 @@ RUN ${PIP} install --no-cache-dir -U \
103105
keras_applications==1.0.8 \
104106
keras_preprocessing==1.1.0 \
105107
keras==2.3.1 \
108+
python-dateutil==2.8.0 \
109+
PyYAML==5.1.2 \
106110
requests==2.22.0 \
107-
awscli==1.16.196 \
111+
awscli==1.16.303 \
108112
mpi4py==3.0.2 \
109113
"sagemaker-tensorflow>=2.0,<2.1" \
110114
# Let's install TensorFlow separately in the end to avoid
111115
# the library version to be overwritten
112-
&& ${PIP} install --force-reinstall --no-cache-dir -U ${TF_URL} \
116+
# ${PIP} install --no-cache-dir -U ${TF_URL} \
117+
&& ${PIP} install --no-cache-dir -U \
113118
$TENSORFLOW_WHL \
114119
horovod==0.18.2 \
115120
&& ${PIP} install --no-cache-dir -U \
116121
$FRAMEWORK_SUPPORT_INSTALLABLE \
117-
&& rm -f $FRAMEWORK_SUPPORT_INSTALLABLE \
118-
&& ${PIP} uninstall -y --no-cache-dir \
119-
markdown
122+
&& rm -f $FRAMEWORK_SUPPORT_INSTALLABLE
120123

121124
COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
122125
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py

docker/2.0.0/py3/Dockerfile.gpu

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,8 @@ RUN ln -s $(which ${PYTHON}) /usr/local/bin/python \
132132
COPY $FRAMEWORK_SUPPORT_INSTALLABLE .
133133
COPY $TENSORFLOW_WHL .
134134

135+
# install PyYAML==5.1.2 to avoid conflict with latest awscli
136+
# python-dateutil==2.8.0 to satisfy botocore associated with latest awscli
135137
RUN ${PIP} install --no-cache-dir -U \
136138
numpy==1.17.4 \
137139
scipy==1.2.2 \
@@ -143,19 +145,19 @@ RUN ${PIP} install --no-cache-dir -U \
143145
keras_preprocessing==1.1.0 \
144146
requests==2.22.0 \
145147
keras==2.3.1 \
146-
awscli==1.16.196 \
148+
python-dateutil==2.8.0 \
149+
PyYAML==5.1.2 \
150+
awscli==1.16.303 \
147151
mpi4py==3.0.2 \
148152
"sagemaker-tensorflow>=2.0,<2.1" \
149153
# Let's install TensorFlow separately in the end to avoid
150154
# the library version to be overwritten
151-
# ${PIP} install --force-reinstall --no-cache-dir -U ${TF_URL} \
152-
&& ${PIP} install --force-reinstall --no-cache-dir -U \
155+
# ${PIP} install --no-cache-dir -U ${TF_URL} \
156+
&& ${PIP} install --no-cache-dir -U \
153157
$TENSORFLOW_WHL \
154158
&& ${PIP} install --no-cache-dir -U \
155159
$FRAMEWORK_SUPPORT_INSTALLABLE \
156-
&& rm -f $FRAMEWORK_SUPPORT_INSTALLABLE \
157-
&& ${PIP} uninstall -y --no-cache-dir \
158-
markdown
160+
&& rm -f $FRAMEWORK_SUPPORT_INSTALLABLE
159161

160162
# Install Horovod, temporarily using CUDA stubs
161163
RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs \

test/integration/sagemaker/test_mnist.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,8 @@ def test_s3_plugin(sagemaker_session, ecr_image, instance_type, region, framewor
8989
hyperparameters={
9090
# Saving a checkpoint after every 10 steps to hammer the S3 plugin
9191
'save-checkpoint-steps': 10,
92-
# Disable throttling for checkpoint and model saving
93-
'throttle-secs': 0,
92+
# Using minimal (1 second) throttling for checkpoint and model saving
93+
'throttle-secs': 1,
9494
# Without the patch training jobs would fail around 100th to
9595
# 150th step
9696
'max-steps': 200,

0 commit comments

Comments
 (0)