Commit 7768c2b

sboshin authored and chuyang-deng committed
horovod_mnist rewrite (#252)
* fix: tensorflow-2.0 library code changes (#247)
* change: tensorflow-2.0 tests
* fix: tensorflow-2.0 library code changes
* remove >=2.0 off tensorflow restrictions
* fix: update mnist scripts for tf-2.0
* add dockerfiles
* fix: update scripts to support tf-2.0 (#250)
* Upgrading Keras
* Upgrading horovod_mnist for v2, based on horovod mnist example for tf2 on horovod github
1 parent aeeb116 commit 7768c2b

File tree

2 files changed: +64 -100 lines changed

docker/2.0.0/py3/Dockerfile.gpu

Lines changed: 1 addition & 1 deletion
@@ -143,7 +143,7 @@ RUN ${PIP} install --no-cache-dir -U \
     keras_preprocessing==1.1.0 \
     requests==2.22.0 \
     keras==2.3.1 \
-    awscli \
+    awscli==1.16.196 \
     mpi4py==3.0.2 \
     "sagemaker-tensorflow>=2.0,<2.1" \
     # Let's install TensorFlow separately in the end to avoid
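The only Dockerfile change is pinning awscli to 1.16.196 rather than letting it float to the latest release. A minimal smoke test along these lines (hypothetical, not part of this commit) could be run inside the built image to confirm the pins resolved as expected:

```python
# Hypothetical smoke test: verify the pinned packages resolved to the
# expected versions inside the built image.
import pkg_resources

PINS = {
    'awscli': '1.16.196',
    'keras': '2.3.1',
    'mpi4py': '3.0.2',
}

for name, expected in PINS.items():
    installed = pkg_resources.get_distribution(name).version
    assert installed == expected, '%s is %s, expected %s' % (name, installed, expected)
print('All pinned versions match.')
```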

test/resources/mnist/horovod_mnist.py

Lines changed: 63 additions & 99 deletions
@@ -10,120 +10,84 @@
 # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
-from __future__ import absolute_import, print_function
-
 import os
-import subprocess
-
-import keras
-from keras.datasets import mnist
-from keras.models import Sequential
-from keras.layers import Dense, Dropout, Flatten
-from keras.layers import Conv2D, MaxPooling2D
-from keras import backend as K
 import tensorflow as tf
-import horovod.keras as hvd
-
+import horovod.tensorflow as hvd
 
 # Horovod: initialize Horovod.
 hvd.init()
 
 # Horovod: pin GPU to be used to process local rank (one GPU per process)
-config = tf.compat.v1.ConfigProto()
-config.gpu_options.allow_growth = True
-config.gpu_options.visible_device_list = str(hvd.local_rank())
-K.set_session(tf.compat.v1.Session(config=config))
-
-batch_size = 128
-num_classes = 10
-
-epochs = 1
-
-# Input image dimensions
-img_rows, img_cols = 28, 28
-
-# The data, shuffled and split between train and test sets
-(x_train, y_train), (x_test, y_test) = mnist.load_data()
-
-x_train = x_train[:600]
-y_train = y_train[:600]
-x_test = x_test[:100]
-y_test = y_test[:100]
-
-if K.image_data_format() == 'channels_first':
-    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
-    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
-    input_shape = (1, img_rows, img_cols)
-else:
-    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
-    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
-    input_shape = (img_rows, img_cols, 1)
-
-x_train = x_train.astype('float32')
-x_test = x_test.astype('float32')
-x_train /= 255
-x_test /= 255
-print('x_train shape:', x_train.shape)
-print(x_train.shape[0], 'train samples')
-print(x_test.shape[0], 'test samples')
-
-# Convert class vectors to binary class matrices
-y_train = keras.utils.to_categorical(y_train, num_classes)
-y_test = keras.utils.to_categorical(y_test, num_classes)
-
-model = Sequential()
-model.add(Conv2D(32, kernel_size=(3, 3),
-                 activation='relu',
-                 input_shape=input_shape))
-model.add(Conv2D(64, (3, 3), activation='relu'))
-model.add(MaxPooling2D(pool_size=(2, 2)))
-model.add(Dropout(0.25))
-model.add(Flatten())
-model.add(Dense(128, activation='relu'))
-model.add(Dropout(0.5))
-model.add(Dense(num_classes, activation='softmax'))
+gpus = tf.config.experimental.list_physical_devices('GPU')
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+if gpus:
+    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
+
+(mnist_images, mnist_labels), _ = \
+    tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % hvd.rank())
+
+dataset = tf.data.Dataset.from_tensor_slices(
+    (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
+     tf.cast(mnist_labels, tf.int64))
+)
+dataset = dataset.repeat().shuffle(10000).batch(128)
+
+mnist_model = tf.keras.Sequential([
+    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
+    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
+    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
+    tf.keras.layers.Dropout(0.25),
+    tf.keras.layers.Flatten(),
+    tf.keras.layers.Dense(128, activation='relu'),
+    tf.keras.layers.Dropout(0.5),
+    tf.keras.layers.Dense(10, activation='softmax')
+])
+loss = tf.losses.SparseCategoricalCrossentropy()
 
 # Horovod: adjust learning rate based on number of GPUs.
-opt = keras.optimizers.Adadelta(1.0 * hvd.size())
+opt = tf.optimizers.Adam(0.001 * hvd.size())
 
-# Horovod: add Horovod Distributed Optimizer.
-opt = hvd.DistributedOptimizer(opt)
+checkpoint_dir = './checkpoints'
+checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt)
 
-model.compile(loss=keras.losses.categorical_crossentropy,
-              optimizer=opt,
-              metrics=['accuracy'])
 
-callbacks = [
-    # Horovod: broadcast initial variable states from rank 0 to all other processes.
-    # This is necessary to ensure consistent initialization of all workers when
-    # training is started with random weights or restored from a checkpoint.
-    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
-]
+@tf.function
+def training_step(images, labels, first_batch):
+    with tf.GradientTape() as tape:
+        probs = mnist_model(images, training=True)
+        loss_value = loss(labels, probs)
 
-# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
-if hvd.rank() == 0:
-    callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
+    # Horovod: add Horovod Distributed GradientTape.
+    tape = hvd.DistributedGradientTape(tape)
 
-model.fit(x_train, y_train,
-          batch_size=batch_size,
-          callbacks=callbacks,
-          epochs=epochs,
-          verbose=1,
-          validation_data=(x_test, y_test))
-score = model.evaluate(x_test, y_test, verbose=0)
-print('Test loss:', score[0])
-print('Test accuracy:', score[1])
+    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
+    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))
 
+    # Horovod: broadcast initial variable states from rank 0 to all other processes.
+    # This is necessary to ensure consistent initialization of all workers when
+    # training is started with random weights or restored from a checkpoint.
+    #
+    # Note: broadcast should be done after the first gradient step to ensure optimizer
+    # initialization.
+    if first_batch:
+        hvd.broadcast_variables(mnist_model.variables, root_rank=0)
+        hvd.broadcast_variables(opt.variables(), root_rank=0)
 
-if hvd.rank() == 0:
-    # Exports the keras model as TensorFlow Serving Saved Model
-    with K.get_session() as session:
+    return loss_value
 
-        init = tf.compat.v1.global_variables_initializer()
-        session.run(init)
 
-        tf.compat.v1.saved_model.simple_save(
-            session,
-            os.path.join('/opt/ml/model/mnist/1'),
-            inputs={'input_image': model.input},
-            outputs={t.name: t for t in model.outputs})
+# Horovod: adjust number of steps based on number of GPUs.
+for batch, (images, labels) in enumerate(dataset.take(600 // hvd.size())):
+    loss_value = training_step(images, labels, batch == 0)
+
+    if batch % 10 == 0 and hvd.local_rank() == 0:
+        print('Step #%d\tLoss: %.6f' % (batch, loss_value))
+
+# Horovod: save checkpoints only on worker 0 to prevent other workers from
+# corrupting it.
+if hvd.rank() == 0:
+    # Export the keras model as Tensorflow SavedModelBundle
+    mnist_model.save(
+        os.path.join('/opt/ml/model/mnist/1'),
+        save_format='tf')
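The rewritten script follows Horovod's TF2 MNIST example: each rank trains the model through a `hvd.DistributedGradientTape`, initial variables are broadcast from rank 0 after the first optimizer step, and rank 0 exports a SavedModel to `/opt/ml/model/mnist/1`. It is typically launched with something like `horovodrun -np 2 python horovod_mnist.py`. As a rough sketch (assuming TF 2.x is installed and a training run has already written the SavedModel), the export can be checked by loading it back and running one inference:

```python
# Rough sketch (not part of this commit): load the exported SavedModel and
# run a single prediction to confirm the export is servable.
import numpy as np
import tensorflow as tf

model = tf.keras.models.load_model('/opt/ml/model/mnist/1')

# One dummy 28x28 grayscale image, shaped like the training batches (N, 28, 28, 1).
image = np.zeros((1, 28, 28, 1), dtype='float32')
probs = model.predict(image)
print('Predicted digit:', int(probs.argmax(axis=-1)[0]))
```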
