
Commit e9f59e9

Add smdebug to TF 2.x (#299)
1 parent f38ee68 commit e9f59e9

File tree

docker/2.1.0/py3/Dockerfile.cpu
docker/2.1.0/py3/Dockerfile.gpu
test/integration/sagemaker/test_mnist.py
test/resources/mnist/mnist_smdebug.py

4 files changed: +125 -0 lines changed

docker/2.1.0/py3/Dockerfile.cpu

Lines changed: 1 addition & 0 deletions
@@ -103,6 +103,7 @@ RUN ${PIP} install --no-cache-dir -U \
     keras_applications==1.0.8 \
     keras_preprocessing==1.1.0 \
     keras==2.3.1 \
+    smdebug==0.7.0 \
     python-dateutil==2.8.1 \
     pyYAML==5.2 \
     requests==2.22.0 \

docker/2.1.0/py3/Dockerfile.gpu

Lines changed: 1 addition & 0 deletions
@@ -147,6 +147,7 @@ RUN ${PIP} install --no-cache-dir -U \
     keras_applications==1.0.8 \
     keras_preprocessing==1.1.0 \
     keras==2.3.1 \
+    smdebug==0.7.0 \
     python-dateutil==2.8.1 \
     pyYAML==5.2 \
     requests==2.22.0 \
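Both images pin the same smdebug release. A quick sanity check after building either image is to import the package inside the container and confirm the pinned version. A minimal sketch (run inside the built image, e.g. via docker run), assuming smdebug exposes __version__:

# check_smdebug.py -- hypothetical sanity check, not part of this commit.
import smdebug
import smdebug.tensorflow as smd  # fails if the TensorFlow backend cannot be loaded

# Assumes the package exposes __version__; compare against the pinned release.
assert smdebug.__version__ == '0.7.0', smdebug.__version__
print('smdebug', smdebug.__version__, 'imports cleanly with the TensorFlow backend')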

test/integration/sagemaker/test_mnist.py

Lines changed: 20 additions & 0 deletions
@@ -146,6 +146,26 @@ def test_tuning(sagemaker_session, ecr_image, instance_type, framework_version):
     tuner.wait()


+def test_smdebug(sagemaker_session, ecr_image, instance_type, framework_version):
+    resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
+    script = os.path.join(resource_path, 'mnist', 'mnist_smdebug.py')
+    hyperparameters = {'smdebug_path': '/opt/ml/output/tensors'}
+    estimator = TensorFlow(entry_point=script,
+                           role='SageMakerRole',
+                           train_instance_type=instance_type,
+                           train_instance_count=1,
+                           sagemaker_session=sagemaker_session,
+                           image_name=ecr_image,
+                           framework_version=framework_version,
+                           script_mode=True,
+                           hyperparameters=hyperparameters)
+    inputs = estimator.sagemaker_session.upload_data(
+        path=os.path.join(resource_path, 'mnist', 'data'),
+        key_prefix='scriptmode/mnist_smdebug')
+    estimator.fit(inputs, job_name=unique_name_from_base('test-sagemaker-mnist-smdebug'))
+    _assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data)
+
+
 def _assert_checkpoint_exists(region, model_dir, checkpoint_number):
     _assert_s3_file_exists(region, os.path.join(model_dir, 'graph.pbtxt'))
     _assert_s3_file_exists(region,
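The new test relies on _assert_s3_file_exists, which is defined elsewhere in this module and is not part of the diff. For reference, a minimal sketch of what such a helper might look like, assuming boto3 and a standard s3://bucket/key URI; the actual implementation in the repository may differ:

from urllib.parse import urlparse

import boto3


def _assert_s3_file_exists(region, s3_url):
    # Hypothetical reconstruction: split the s3://bucket/key URI and issue a
    # HEAD request; head_object raises ClientError (404) when the object is
    # missing, which fails the calling test.
    parsed = urlparse(s3_url)
    s3 = boto3.client('s3', region_name=region)
    s3.head_object(Bucket=parsed.netloc, Key=parsed.path.lstrip('/'))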

test/resources/mnist/mnist_smdebug.py

Lines changed: 103 additions & 0 deletions
import argparse
import json
import os

import numpy as np
import tensorflow.compat.v2 as tf

import smdebug.tensorflow as smd
from smdebug.core.collection import CollectionKeys
from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS
from smdebug.tensorflow import ReductionConfig, SaveConfig
from smdebug.trials import create_trial


def _parse_args():
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument('--epochs', type=int, default=1)

    # Data, model, and output directories.
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument(
        '--smdebug_path',
        type=str,
        default=None,
        help='Local path or S3 URI where tensor data will be stored.',
    )
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
    parser.add_argument('--hosts', type=list, default=json.loads(os.environ['SM_HOSTS']))
    parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST'])

    return parser.parse_known_args()


def _load_training_data(base_dir):
    x_train = np.load(os.path.join(base_dir, 'train', 'x_train.npy'))
    y_train = np.load(os.path.join(base_dir, 'train', 'y_train.npy'))
    return x_train, y_train


def _load_testing_data(base_dir):
    x_test = np.load(os.path.join(base_dir, 'test', 'x_test.npy'))
    y_test = np.load(os.path.join(base_dir, 'test', 'y_test.npy'))
    return x_test, y_test


def create_smdebug_hook(out_dir):
    # Tensor collections the hook saves at each configured step.
    include_collections = [
        CollectionKeys.WEIGHTS,
        CollectionKeys.BIASES,
        CollectionKeys.GRADIENTS,
        CollectionKeys.LOSSES,
        CollectionKeys.OUTPUTS,
        CollectionKeys.METRICS,
        CollectionKeys.OPTIMIZER_VARIABLES,
    ]
    # Save every 3rd step, storing norms and reductions rather than full tensors.
    save_config = SaveConfig(save_interval=3)
    hook = smd.KerasHook(
        out_dir,
        save_config=save_config,
        include_collections=include_collections,
        reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS),
    )
    return hook


args, unknown = _parse_args()

# The smdebug hook is registered with Keras as an ordinary callback.
hook = create_smdebug_hook(args.smdebug_path)
hooks = [hook]

model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(512, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation=tf.nn.softmax)
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
x_train, y_train = _load_training_data(args.train)
x_test, y_test = _load_testing_data(args.train)
model.fit(x_train, y_train, epochs=args.epochs, callbacks=hooks)
model.evaluate(x_test, y_test, callbacks=hooks)

# Only the first host writes the model artifact.
if args.current_host == args.hosts[0]:
    model.save(os.path.join('/opt/ml/model', 'my_model.h5'))

# Read the saved tensors back and fail the training job if nothing was recorded.
print("Creating the trial with out_dir {0}".format(args.smdebug_path))
trial = create_trial(args.smdebug_path)
assert trial

print(f"trial.tensor_names() = {trial.tensor_names()}")

weights_tensors = hook.collection_manager.get("weights").tensor_names
assert len(weights_tensors) > 0

losses_tensors = hook.collection_manager.get("losses").tensor_names
assert len(losses_tensors) > 0
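After a run, the tensors written under smdebug_path can also be inspected offline with the same Trial API the script uses for its final assertions. A minimal sketch, assuming the output directory has been copied locally and that the reduction-value accessors behave as in smdebug 0.7:

from smdebug.trials import create_trial

# Point a Trial at the directory (or S3 prefix) the KerasHook wrote to.
trial = create_trial('/opt/ml/output/tensors')

# Steps at which tensors were saved (every 3rd step per the SaveConfig above).
print('saved steps:', trial.steps())

# The hook above stores norms/reductions rather than full tensors for most
# collections, so read reduction values instead of raw values.
name = trial.tensor_names()[0]
tensor = trial.tensor(name)
for step in tensor.steps():
    print(name, 'step', step, tensor.reduction_values(step))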
