Use executing_eagerly_outside_functions for global execution context (#178)

vandanavk · web-flow · commit b1b3bac09c55 · 2020-03-13T22:39:06.000-07:00
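Fixes a problem with TF 2.1 training where gradients were not emitted: the TF 2.x skip check in `set_gradients` and `set_optimizer_variables` now uses `ops.executing_eagerly_outside_functions()` instead of `tf.executing_eagerly()`.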
diff --git a/setup.py b/setup.py
@@ -47,7 +47,7 @@ def build_package(version):
         name="smdebug",
         version=version,
         long_description="\n".join(DOCLINES[1:]),
-        long_description_content_type="text/x-rst",
+        long_description_content_type="text/markdown",
         author="AWS DeepLearning Team",
         description=DOCLINES[0],
         url="https://github.com/awslabs/sagemaker-debugger",
diff --git a/smdebug/_version.py b/smdebug/_version.py
@@ -1 +1 @@
-__version__ = "0.7.0"
+__version__ = "0.7.1"
diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py
@@ -6,6 +6,7 @@
 # Third Party
 import tensorflow.compat.v1 as tf
 from tensorflow.python.distribute.distribute_lib import _DefaultDistributionStrategy
+from tensorflow.python.framework import ops
 
 # First Party
 from smdebug.core.collection import DEFAULT_TF_COLLECTIONS
@@ -420,7 +421,11 @@ def set_gradients(self, gradients=None, gradients_and_variables=None):
         # TF 2.x doesn't provide gradient/optimizer variable names and values by default.
         # Skipping set_gradients and set_optimizer_variables for Tf 2.x until there is
         # support to pass names and values from TF side.
-        if is_tf_version_2x() and tf.executing_eagerly():
+
+        # From TF 2.2, executing_eagerly_outside_functions() can be used as
+        # ops.executing_eagerly_outside_functions() or tf.compat.v1.executing_eagerly_outside_functions().
+        # But in TF 2.1, only ops.executing_eagerly_outside_functions() is valid
+        if is_tf_version_2x() and ops.executing_eagerly_outside_functions():
             return
         if self._gradients_set is False:
             if gradients is not None:
@@ -441,7 +446,11 @@ def set_optimizer_variables(self, optimizer_variables):
         # TF 2.x doesn't provide gradient/optimizer variable names and values by default.
         # Skipping set_gradients and set_optimizer_variables for Tf 2.x until there is
         # support to pass names and values from TF side.
-        if is_tf_version_2x() and tf.executing_eagerly():
+
+        # From TF 2.2, executing_eagerly_outside_functions() can be used as
+        # ops.executing_eagerly_outside_functions() or tf.compat.v1.executing_eagerly_outside_functions().
+        # But in TF 2.1, only ops.executing_eagerly_outside_functions() is valid
+        if is_tf_version_2x() and ops.executing_eagerly_outside_functions():
             return
         # since this is done for each variable at a time for keras, not checking if set already
         self.collection_manager.get(CollectionKeys.OPTIMIZER_VARIABLES).add_for_mode(
diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py
@@ -16,6 +16,7 @@
 import smdebug.tensorflow as smd
 from smdebug.core.access_layer import has_training_ended
 from smdebug.core.collection import CollectionKeys
+from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR
 from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS
 from smdebug.exceptions import TensorUnavailableForStep
 from smdebug.tensorflow import ReductionConfig, SaveConfig
@@ -238,3 +239,55 @@ def test_weights_collections(out_dir, tf_eager_mode):
     assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2
     assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) == 1
     assert len(trial.tensor_names(collection=CollectionKeys.METRICS)) == 3
+
+
+@pytest.mark.slow
+def test_include_collections(out_dir, tf_eager_mode):
+    include_collections = [
+        CollectionKeys.WEIGHTS,
+        CollectionKeys.BIASES,
+        CollectionKeys.GRADIENTS,
+        CollectionKeys.LOSSES,
+        CollectionKeys.OUTPUTS,
+        CollectionKeys.METRICS,
+        CollectionKeys.OPTIMIZER_VARIABLES,
+    ]
+    save_config = SaveConfig(save_interval=3)
+    hook = smd.KerasHook(
+        out_dir,
+        save_config=save_config,
+        include_collections=include_collections,
+        reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS),
+    )
+    helper_keras_fit(out_dir, hook=hook, steps=["train", "eval", "predict"], eager=tf_eager_mode)
+
+    trial = smd.create_trial(path=out_dir)
+    # can't save gradients in TF 2.x
+    if tf_eager_mode:
+        assert len(trial.tensor_names()) == 8
+    else:
+        assert len(trial.tensor_names()) == 18
+        assert len(trial.tensor_names(collection=CollectionKeys.GRADIENTS)) == 4
+        assert len(trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 5
+    assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 2
+    assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2
+    assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) == 1
+    assert len(trial.tensor_names(collection=CollectionKeys.METRICS)) == 3
+
+
+@pytest.mark.slow
+def test_hook_from_json(out_dir, tf_eager_mode, monkeypatch):
+    monkeypatch.setenv(
+        CONFIG_FILE_PATH_ENV_STR,
+        "tests/tensorflow/hooks/test_json_configs/test_collection_defaults.json",
+    )
+    hook = smd.KerasHook.create_from_json_file()
+    helper_keras_fit(out_dir, hook=hook, steps=["train"], eager=tf_eager_mode)
+
+    trial = smd.create_trial(path=out_dir)
+    # can't save gradients in TF 2.x
+    assert len(trial.tensor_names()) == 6
+    assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 0
+    assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 2
+    assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) == 1
+    assert len(trial.tensor_names(collection=CollectionKeys.METRICS)) == 3
diff --git a/tests/zero_code_change/tensorflow2_integration_tests.py b/tests/zero_code_change/tensorflow2_integration_tests.py
@@ -45,28 +45,31 @@ def get_keras_data():
     return (x_train, y_train), (x_test, y_test)
 
 
-def test_keras_v2(script_mode: bool = False, eager_mode: bool = True):
-    """ Works as intended. """
+def helper_test_keras_v2(script_mode: bool = False, eager_mode: bool = True):
+    """ Test the default ZCC behavior of saving losses and metrics in eager and non-eager modes."""
     smd.del_hook()
-
+    tf.keras.backend.clear_session()
     if not eager_mode:
         tf.compat.v1.disable_eager_execution()
     with SagemakerSimulator() as sim:
         model = get_keras_model_v2()
         (x_train, y_train), (x_test, y_test) = get_keras_data()
 
-        model.compile(
-            loss="sparse_categorical_crossentropy",
-            optimizer=tf.keras.optimizers.RMSprop(),
-            metrics=["accuracy"],
-        )
+        opt = tf.keras.optimizers.RMSprop()
         if script_mode:
             hook = smd.KerasHook(out_dir=sim.out_dir, export_tensorboard=True)
+            opt = hook.wrap_optimizer(opt)
+            model.compile(
+                loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"]
+            )
             history = model.fit(
                 x_train, y_train, batch_size=64, epochs=5, validation_split=0.2, callbacks=[hook]
             )
             test_scores = model.evaluate(x_test, y_test, verbose=2, callbacks=[hook])
         else:
+            model.compile(
+                loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"]
+            )
             history = model.fit(x_train, y_train, batch_size=64, epochs=5, validation_split=0.2)
             test_scores = model.evaluate(x_test, y_test, verbose=2)
 
@@ -77,6 +80,106 @@ def test_keras_v2(script_mode: bool = False, eager_mode: bool = True):
         trial = smd.create_trial(path=sim.out_dir)
         assert len(trial.steps()) > 0, "Nothing saved at any step."
         assert len(trial.tensor_names()) > 0, "Tensors were not saved."
+        assert len(trial.tensor_names(collection="losses")) > 0
+
+
+def helper_test_keras_v2_json_config(
+    json_file_contents, script_mode: bool = False, eager_mode: bool = True
+):
+    """ Tests ZCC with custom hook configs """
+    smd.del_hook()
+    tf.keras.backend.clear_session()
+    if not eager_mode:
+        tf.compat.v1.disable_eager_execution()
+    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
+        model = get_keras_model_v2()
+        (x_train, y_train), (x_test, y_test) = get_keras_data()
+
+        opt = tf.keras.optimizers.RMSprop()
+        if script_mode:
+            hook = smd.KerasHook.create_from_json_file()
+            opt = hook.wrap_optimizer(opt)
+            model.compile(
+                loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"]
+            )
+            history = model.fit(
+                x_train, y_train, batch_size=64, epochs=5, validation_split=0.2, callbacks=[hook]
+            )
+            test_scores = model.evaluate(x_test, y_test, verbose=2, callbacks=[hook])
+        else:
+            model.compile(
+                loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"]
+            )
+            history = model.fit(x_train, y_train, epochs=5, batch_size=64, validation_split=0.2)
+            test_scores = model.evaluate(x_test, y_test, verbose=2)
+
+        hook = smd.get_hook()
+        assert hook
+        hook.close()
+        # Check that hook created and tensors saved
+        trial = smd.create_trial(path=sim.out_dir)
+        assert len(trial.steps()) > 0, "Nothing saved at any step."
+        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
+        if not eager_mode:
+            assert len(trial.tensor_names(collection="gradients")) > 0
+        assert len(trial.tensor_names(collection="weights")) > 0
+        assert len(trial.tensor_names(collection="losses")) > 0
+
+
+def test_keras_v2_default(script_mode: bool = False, eager_mode: bool = True):
+    # Test default ZCC behavior
+    helper_test_keras_v2(script_mode=script_mode, eager_mode=eager_mode)
+
+
+def test_keras_v2_multi_collections(script_mode: bool = False, eager_mode: bool = True):
+    # Test multiple collections included in hook json
+    json_file_contents = """
+            {
+                "S3OutputPath": "s3://sagemaker-test",
+                "LocalPath": "/opt/ml/output/tensors",
+                "HookParameters" : {
+                    "save_interval": "2",
+                    "include_workers": "all"
+                },
+                "CollectionConfigurations": [
+                    {
+                        "CollectionName": "gradients"
+                    },
+                    {
+                        "CollectionName": "weights"
+                    },
+                    {
+                        "CollectionName": "losses"
+                    },
+                    {
+                        "CollectionName": "biases"
+                    },
+                    {
+                        "CollectionName": "optimizer_variables"
+                    }
+                ]
+            }
+            """
+    helper_test_keras_v2_json_config(
+        script_mode=script_mode, eager_mode=eager_mode, json_file_contents=json_file_contents
+    )
+
+
+def test_keras_v2_save_all(script_mode: bool = False, eager_mode: bool = True):
+    # Test save all through hook config
+    json_file_contents = """
+            {
+                "S3OutputPath": "s3://sagemaker-test",
+                "LocalPath": "/opt/ml/output/tensors",
+                "HookParameters" : {
+                    "save_steps": "0,1,2,3",
+                    "save_all": true
+                }
+            }
+            """
+    helper_test_keras_v2_json_config(
+        script_mode=script_mode, eager_mode=eager_mode, json_file_contents=json_file_contents
+    )
 
 
 if __name__ == "__main__":
@@ -88,6 +191,11 @@ def test_keras_v2(script_mode: bool = False, eager_mode: bool = True):
     script_mode = args.script_mode
 
     # eager mode
-    test_keras_v2(script_mode=script_mode)
+    test_keras_v2_default(script_mode)
+    test_keras_v2_multi_collections(script_mode)
+    test_keras_v2_save_all(script_mode)
+
     # non-eager mode
-    test_keras_v2(script_mode=script_mode, eager_mode=False)
+    test_keras_v2_default(script_mode, eager_mode=False)
+    test_keras_v2_multi_collections(script_mode, eager_mode=False)
+    test_keras_v2_save_all(script_mode, eager_mode=False)

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1 +1 @@` |
| `1` | | `-__version__ = "0.7.0"` |
| | `1` | `+__version__ = "0.7.1"` |