feature: use tensorflow 2.3.1 and add data parallel integ test (#411)

Dan · web-flow · commit 92eb20a5d6e5 · 2020-12-11T17:23:34.000-08:00
diff --git a/buildspec.yml b/buildspec.yml
@@ -2,7 +2,7 @@ version: 0.2
 
 env:
   variables:
-    FRAMEWORK_VERSION: '2.2.0'
+    FRAMEWORK_VERSION: '2.3.1'
     CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
     GPU_INSTANCE_TYPE: 'ml.p2.xlarge'
     ECR_REPO: 'sagemaker-test'
@@ -61,23 +61,23 @@ phases:
       # run GPU local integration tests
       - printf "$SETUP_CMDS" > $SETUP_FILE
       # no reason to rebuild the image again since it was already built and pushed to ECR during CPU tests
-      - generic_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG"
+      - generic_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --dockerfile-type tf --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG"
       - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
       - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
-      - dlc_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"
+      - dlc_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --dockerfile-type dlc.gpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"
       - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup"
       - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
 
       # run CPU sagemaker integration tests
-      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG"
+      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker -n auto --reruns 3 --reruns-delay 15 --dockerfile-type tf --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG"
       - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
-      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG"
+      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker -n auto --reruns 3 --reruns-delay 15 --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG"
       - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
 
       # run GPU sagemaker integration tests
-      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG"
+      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker -n auto --reruns 3 --reruns-delay 15 --dockerfile-type tf --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG"
       - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
-      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG"
+      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker -n auto --reruns 3 --reruns-delay 15 --dockerfile-type dlc.gpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG"
       - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
     finally:
       # shut down remote GPU instance
diff --git a/setup.py b/setup.py
@@ -35,6 +35,7 @@ def read_version():
     "pytest",
     "pytest-cov",
     "pytest-xdist",
+    "pytest-rerunfailures",
     "mock",
     "sagemaker[local]>=2",
     "tensorflow<2.4",
diff --git a/test/container/2.3.1/Dockerfile.dlc.cpu b/test/container/2.3.1/Dockerfile.dlc.cpu
@@ -0,0 +1,6 @@
+ARG region
+FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:2.3.1-cpu-py37
+# Install the sagemaker_tensorflow_training sdist copied from dist/, then remove the copied archive file.
+COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
+RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
+    rm /sagemaker_tensorflow_training.tar.gz
diff --git a/test/container/2.3.1/Dockerfile.dlc.gpu b/test/container/2.3.1/Dockerfile.dlc.gpu
@@ -0,0 +1,6 @@
+ARG region
+FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:2.3.1-gpu-py37-cu110-ubuntu18.04
+# Install the sagemaker_tensorflow_training sdist copied from dist/, then remove the copied archive file.
+COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
+RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
+    rm /sagemaker_tensorflow_training.tar.gz
diff --git a/test/container/2.3.1/Dockerfile.tf b/test/container/2.3.1/Dockerfile.tf
@@ -0,0 +1,7 @@
+FROM tensorflow/tensorflow:2.3.1-gpu
+# NOTE(review): presumably read by the SageMaker training toolkit to locate the framework entry point; confirm.
+ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main
+# Install the sagemaker_tensorflow_training sdist copied from dist/, then remove the copied archive file.
+COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
+RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
+    rm /sagemaker_tensorflow_training.tar.gz
diff --git a/test/integration/sagemaker/test_smdataparallel.py b/test/integration/sagemaker/test_smdataparallel.py
@@ -0,0 +1,57 @@
+# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import os
+
+import pytest
+import sagemaker
+from sagemaker.tensorflow import TensorFlow
+from sagemaker.utils import unique_name_from_base
+
+from integration import DEFAULT_TIMEOUT, RESOURCE_PATH
+from integration.sagemaker.timeout import timeout
+
+
+@pytest.mark.skip_cpu
+@pytest.mark.skip_generic
+@pytest.mark.parametrize(
+    "instances, instance_type",
+    [(2, "ml.p3.16xlarge")],
+)
+def test_smdataparallel_training(instances, instance_type, sagemaker_session, image_uri, framework_version, tmpdir):
+    default_bucket = sagemaker_session.default_bucket()
+    output_path = "s3://{}/{}/{}".format(default_bucket, "tensorflow", "smdataparallel")
+
+    estimator = TensorFlow(
+        entry_point=os.path.join(RESOURCE_PATH, "mnist", "smdataparallel_mnist.py"),
+        role="SageMakerRole",
+        instance_type=instance_type,
+        sagemaker_session=sagemaker_session,
+        instance_count=instances,
+        image_uri=image_uri,
+        output_path=output_path,
+        framework_version=framework_version,
+        py_version="py3",
+        distribution={"smdistributed": {"dataparallel": {"enabled": True}}}
+    )
+
+    with timeout(minutes=DEFAULT_TIMEOUT):
+        estimator.fit(job_name=unique_name_from_base("test-tf-smdataparallel"))
+
+        model_data_source = sagemaker.local.data.get_data_source_instance(
+            estimator.model_data, sagemaker_session
+        )
+
+        for filename in model_data_source.get_file_list():
+            assert os.path.basename(filename) == "model.tar.gz"
diff --git a/test/resources/mnist/smdataparallel_mnist.py b/test/resources/mnist/smdataparallel_mnist.py
@@ -0,0 +1,83 @@
+# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+import tensorflow as tf
+
+import smdistributed.dataparallel.tensorflow as dist
+
+tf.random.set_seed(42)
+
+dist.init()
+
+gpus = tf.config.experimental.list_physical_devices("GPU")
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+if gpus:
+    tf.config.experimental.set_visible_devices(gpus[dist.local_rank()], "GPU")
+
+(mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data(
+    path="mnist-%d.npz" % dist.rank()
+)
+
+dataset = tf.data.Dataset.from_tensor_slices(
+    (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), tf.cast(mnist_labels, tf.int64))
+)
+dataset = dataset.repeat().shuffle(10000).batch(128)
+
+mnist_model = tf.keras.Sequential(
+    [
+        tf.keras.layers.Conv2D(32, [3, 3], activation="relu"),
+        tf.keras.layers.Conv2D(64, [3, 3], activation="relu"),
+        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
+        tf.keras.layers.Dropout(0.25),
+        tf.keras.layers.Flatten(),
+        tf.keras.layers.Dense(128, activation="relu"),
+        tf.keras.layers.Dropout(0.5),
+        tf.keras.layers.Dense(10, activation="softmax"),
+    ]
+)
+loss = tf.losses.SparseCategoricalCrossentropy()
+# LR for 8 node run : 0.000125
+# LR for single node run : 0.001
+opt = tf.optimizers.Adam(0.000125 * dist.size())
+
+checkpoint_dir = "./checkpoints"
+checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt)
+
+
+@tf.function
+def training_step(images, labels, first_batch):
+    with tf.GradientTape() as tape:
+        probs = mnist_model(images, training=True)
+        loss_value = loss(labels, probs)
+
+    tape = dist.DistributedGradientTape(tape)
+
+    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
+    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))
+
+    if first_batch:
+        dist.broadcast_variables(mnist_model.variables, root_rank=0)
+        dist.broadcast_variables(opt.variables(), root_rank=0)
+
+    loss_value = dist.oob_allreduce(loss_value)  # Average the loss across workers
+    return loss_value
+
+
+for batch, (images, labels) in enumerate(dataset.take(10000 // dist.size())):
+    loss_value = training_step(images, labels, batch == 0)
+
+    if batch % 50 == 0 and dist.rank() == 0:
+        print("Step #%d\tLoss: %.6f" % (batch, loss_value))
+
+if dist.rank() == 0:
+    checkpoint.save(checkpoint_dir)