|
| 1 | +import argparse |
| 2 | +import json |
| 3 | +import os |
| 4 | +import sys |
| 5 | + |
| 6 | +import numpy as np |
| 7 | +import tensorflow.compat.v1 as tf |
| 8 | + |
| 9 | +import smdebug.tensorflow as smd |
| 10 | +from smdebug.core.collection import CollectionKeys |
| 11 | +from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS |
| 12 | +from smdebug.tensorflow import ReductionConfig, SaveConfig |
| 13 | +from smdebug.trials import create_trial |
| 14 | + |
| 15 | + |
def _parse_args():
    """Parse the command-line arguments of the training script.

    Defaults for the model directory, training channel, host list and current
    host are read from the ``SM_MODEL_DIR``, ``SM_CHANNEL_TRAINING``,
    ``SM_HOSTS`` and ``SM_CURRENT_HOST`` environment variables, so a
    ``KeyError`` is raised when any of them is not set.

    Returns:
        The ``(namespace, unknown)`` tuple produced by
        ``ArgumentParser.parse_known_args()``: the parsed arguments and the
        list of argument strings the parser did not recognize.
    """
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument('--epochs', type=int, default=1)
    # Data, model, and output directories
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument(
        "--smdebug_path",
        type=str,
        default=None,
        help="S3 URI of the bucket where tensor data will be stored.",
    )
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
    # The environment default is a JSON-encoded list, so decode a command-line
    # value the same way; `type=list` would split the string into characters.
    parser.add_argument('--hosts', type=json.loads, default=json.loads(os.environ['SM_HOSTS']))
    parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST'])

    return parser.parse_known_args()
| 35 | + |
| 36 | + |
def _load_training_data(base_dir):
    """Load the training arrays stored under ``<base_dir>/train``.

    Args:
        base_dir: Directory that contains the ``train`` sub-directory.

    Returns:
        A ``(x_train, y_train)`` tuple with the contents of ``x_train.npy``
        and ``y_train.npy``, in that order.
    """
    train_dir = os.path.join(base_dir, 'train')
    features, labels = (
        np.load(os.path.join(train_dir, file_name))
        for file_name in ('x_train.npy', 'y_train.npy')
    )
    return features, labels
| 41 | + |
| 42 | + |
def _load_testing_data(base_dir):
    """Load the evaluation arrays stored under ``<base_dir>/test``.

    Args:
        base_dir: Directory that contains the ``test`` sub-directory.

    Returns:
        A ``(x_test, y_test)`` tuple with the contents of ``x_test.npy``
        and ``y_test.npy``, in that order.
    """
    features_path = os.path.join(base_dir, 'test', 'x_test.npy')
    labels_path = os.path.join(base_dir, 'test', 'y_test.npy')
    return np.load(features_path), np.load(labels_path)
| 47 | + |
| 48 | + |
def create_smdebug_hook(out_dir):
    """Build the smdebug ``KerasHook`` used as a Keras callback by this script.

    The hook is configured with ``SaveConfig(save_interval=3)`` and a
    ``ReductionConfig`` that requests every norm in ``ALLOWED_NORMS`` and
    every reduction in ``ALLOWED_REDUCTIONS``.

    Args:
        out_dir: Output location handed to ``smd.KerasHook``.

    Returns:
        The configured ``smd.KerasHook`` instance.
    """
    # Each collection is listed exactly once (LOSSES used to appear twice).
    include_collections = [
        CollectionKeys.WEIGHTS,
        CollectionKeys.BIASES,
        CollectionKeys.GRADIENTS,
        CollectionKeys.LOSSES,
        CollectionKeys.OUTPUTS,
        CollectionKeys.METRICS,
        CollectionKeys.OPTIMIZER_VARIABLES,
    ]
    save_config = SaveConfig(save_interval=3)
    reduction_config = ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS)
    return smd.KerasHook(
        out_dir,
        save_config=save_config,
        include_collections=include_collections,
        reduction_config=reduction_config,
    )
| 68 | + |
| 69 | + |
# Parse the script arguments; arguments the parser does not know are kept in `unknown`.
args, unknown = _parse_args()

# The same smdebug hook is passed as a callback to both fit() and evaluate().
hook = create_smdebug_hook(args.smdebug_path)
hooks = [hook]

model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(512, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation=tf.nn.softmax),
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Both splits are read from the same channel directory ("train" and "test" sub-directories).
x_train, y_train = _load_training_data(args.train)
x_test, y_test = _load_testing_data(args.train)
model.fit(x_train, y_train, epochs=args.epochs, callbacks=hooks)
model.evaluate(x_test, y_test, callbacks=hooks)

# Only the first host in the host list writes the model file.
if args.current_host == args.hosts[0]:
    # Honor --model-dir (defaults to SM_MODEL_DIR) instead of a hard-coded path.
    model.save(os.path.join(args.model_dir, 'my_model.h5'))

print(f"Created the trial with out_dir {args.smdebug_path}")
trial = create_trial(args.smdebug_path)
assert trial, "create_trial returned no trial"

print(f"trial.tensor_names() = {trial.tensor_names()}")

# Sanity checks: the hook must have registered tensors in these collections.
weights_tensors = hook.collection_manager.get("weights").tensor_names
assert len(weights_tensors) > 0, "no tensors recorded in the 'weights' collection"

losses_tensors = hook.collection_manager.get("losses").tensor_names
assert len(losses_tensors) > 0, "no tensors recorded in the 'losses' collection"
0 commit comments