Merge pull request #1132 from Kaggle/tpu-1vm-image

djherbis · web-flow · commit cdba7d111437 · 2022-05-16T12:56:25.000-04:00
Get tensorflow, jax, and pytorch working on TPU1VM
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -56,6 +56,22 @@ pipeline {
             '''
           }
         }
+        stage('tensorflow TPU') {
+          options {
+            timeout(time: 180, unit: 'MINUTES')
+          }
+          steps {
+            sh '''#!/bin/bash
+              set -exo pipefail
+              source tpu/config.txt
+              cd packages/
+              ./build_package --base-image gcr.io/kaggle-images/python:${BASE_IMAGE_TAG} \
+                --package tpu-tensorflow \
+                --version $TENSORFLOW_VERSION \
+                --push
+            '''
+          }
+        }
       }
     }
     stage('Build/Test/Diff') {
@@ -150,7 +166,34 @@ pipeline {
               }
             }
           }
-        } 
+        }
+        stage('TPU VM') {
+          stages {
+            stage('Build Tensorflow TPU Image') {
+              options {
+                timeout(time: 20, unit: 'MINUTES')
+              }
+              steps {
+                sh '''#!/bin/bash
+                  set -exo pipefail
+
+                  ./tpu/build | ts
+                  ./push --tpu ${PRETEST_TAG}
+                '''
+              }
+            }
+            stage('Diff TPU VM Image') {
+              steps {
+                sh '''#!/bin/bash
+                set -exo pipefail
+
+                docker pull gcr.io/kaggle-private-byod/python-tpuvm:${PRETEST_TAG}
+                ./diff --tpu --target gcr.io/kaggle-private-byod/python-tpuvm:${PRETEST_TAG}
+              '''
+              }
+            }
+          }
+        }
       }
     }
 
@@ -161,6 +204,7 @@ pipeline {
 
           gcloud container images add-tag gcr.io/kaggle-images/python:${PRETEST_TAG} gcr.io/kaggle-images/python:${STAGING_TAG}
           gcloud container images add-tag gcr.io/kaggle-private-byod/python:${PRETEST_TAG} gcr.io/kaggle-private-byod/python:${STAGING_TAG}
+          gcloud container images add-tag gcr.io/kaggle-private-byod/python-tpuvm:${PRETEST_TAG} gcr.io/kaggle-private-byod/python-tpuvm:${STAGING_TAG}
         '''
       }
     }
diff --git a/diff b/diff
@@ -32,6 +32,10 @@ while :; do
             BASE_IMAGE_TAG='gcr.io/kaggle-private-byod/python:latest'
             TARGET_IMAGE_TAG='kaggle/python-gpu-build'
             ;;
+        -x|--tpu)
+            BASE_IMAGE_TAG='gcr.io/kaggle-private-byod/python-tpuvm:latest'
+            TARGET_IMAGE_TAG='kaggle/python-tpuvm-build'
+            ;;
         -b|--base)
             if [[ -z "$2" ]]; then
                 usage
diff --git a/packages/build_package b/packages/build_package
@@ -117,6 +117,8 @@ fi
 
 # Keep only `tf2-gpu.2-6:m80` in `gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80` 
 TAG=${BASE_IMAGE/gcr.io\/deeplearning-platform-release\//}
+# Keep only `python:v108` in `gcr.io/kaggle-images/python:v108`
+TAG=${TAG/gcr.io\/kaggle-images\//}
 # Replace the `:` in `tf2-gpu.2-6:m80` by `-`
 TAG=${TAG/:/-}
 # Append the package version
diff --git a/packages/tpu-tensorflow.Dockerfile b/packages/tpu-tensorflow.Dockerfile
@@ -1,6 +1,8 @@
-ARG BASE_IMAGE_TAG
+ARG BASE_IMAGE
 
-FROM gcr.io/kaggle-images/python:${BASE_IMAGE_TAG} AS builder
+FROM ${BASE_IMAGE} AS builder
+
+ARG PACKAGE_VERSION
 
 # Use Bazelisk to ensure the proper bazel version is used.
 RUN cd /usr/local/src && \
@@ -12,12 +14,12 @@ RUN cd /usr/local/src && \
 RUN cd /usr/local/src && \
     git clone https://github.com/tensorflow/tensorflow && \
     cd tensorflow && \
-    git checkout tags/v${TENSORFLOW_VERSION} && \
+    git checkout tags/v${PACKAGE_VERSION} && \
     # TODO(rosbo): Is it really needed?
     pip install keras_applications --no-deps && \
     pip install keras_preprocessing --no-deps
 
-# Create a TensorFlow wheel for CPU
+# Create a TensorFlow wheel for TPU
 RUN cd /usr/local/src/tensorflow && \
     cat /dev/null | ./configure && \
     bazel build \
@@ -32,7 +34,22 @@ RUN cd /usr/local/src/tensorflow && \
 RUN cd /usr/local/src/tensorflow && \
     bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg
 
-# TODO(b/152075195): Will likely need to install custom build for TFA & tensorflow-gcs-config
+# Currently disabled (commented out): build TensorFlow addons library against TensorFlow CPU.
+#RUN cd /usr/local/src/ && \
+#    git clone https://github.com/tensorflow/addons && \
+#    cd addons && \
+#    git checkout tags/v0.12.1 && \
+#    python ./configure.py && \
+#    bazel build --enable_runfiles build_pip_pkg && \
+#    bazel-bin/build_pip_pkg /tmp/tfa_cpu && \
+#    bazel clean
+
+# Currently disabled (commented out): build tensorflow_gcs_config library against TensorFlow CPU.
+#ADD tensorflow-gcs-config /usr/local/src/tensorflow_gcs_config/
+#RUN cd /usr/local/src/tensorflow_gcs_config && \
+#    apt-get install -y libcurl4-openssl-dev && \
+#    python setup.py bdist_wheel -d /tmp/tensorflow_gcs_config && \
+#    bazel clean
 
 # Use multi-stage builds to minimize image output size.
 FROM alpine:latest
diff --git a/push b/push
@@ -8,6 +8,7 @@ Push a newly-built image with the given LABEL to gcr.io and DockerHub.
 
 Options:
     -g, --gpu                   Push the image with GPU support.
+    -t, --tpu                   Push the image with TPU support.
     -s, --source-image IMAGE    Tag for the source image. 
 EOF
 }
@@ -26,6 +27,10 @@ while :; do
             SOURCE_IMAGE_TAG='kaggle/python-gpu-build:latest'
             TARGET_IMAGE='gcr.io/kaggle-private-byod/python'
             ;;
+        -t|--tpu)
+            SOURCE_IMAGE_TAG='kaggle/python-tpuvm-build:latest'
+            TARGET_IMAGE='gcr.io/kaggle-private-byod/python-tpuvm'
+            ;;
         -s|--source-image)
             if [[ -z $2 ]]; then
                 usage
diff --git a/tests/common.py b/tests/common.py
@@ -4,3 +4,4 @@
 import unittest
 
 gpu_test = unittest.skipIf(len(os.environ.get('CUDA_VERSION', '')) == 0, 'Not running GPU tests')
+tpu_test = unittest.skipIf(len(os.environ.get('ISTPUVM', '')) == 0, 'Not running TPU tests')
diff --git a/tpu/Dockerfile b/tpu/Dockerfile
@@ -1,13 +1,31 @@
 ARG BASE_IMAGE_TAG
 ARG LIBTPU_IMAGE_TAG
-ARG TENSORFLOW_WHL_IMAGE_TAG
+ARG TENSORFLOW_VERSION
 
 FROM gcr.io/cloud-tpu-v2-images/libtpu:${LIBTPU_IMAGE_TAG} as libtpu
-FROM gcr.io/kaggle-images/python-tpu-tensorflow-whl:${TENSORFLOW_WHL_IMAGE_TAG} AS tensorflow_whl
+FROM gcr.io/kaggle-images/python-tpu-tensorflow-whl:python-${BASE_IMAGE_TAG}-${TENSORFLOW_VERSION} AS tensorflow_whl
 FROM gcr.io/kaggle-images/python:${BASE_IMAGE_TAG}
 
+# ARGs declared before the first FROM are only in scope for FROM lines.
+# TORCH_VERSION is consumed by a RUN step below, so it is declared in this stage.
+ARG TORCH_VERSION
+
+# Flag read by tests/common.py (tpu_test) to decide whether TPU tests run.
+ENV ISTPUVM=1
+
 COPY --from=libtpu /libtpu.so /lib
 
 COPY --from=tensorflow_whl /tmp/tensorflow_pkg/tensorflow*.whl /tmp/tensorflow_pkg/
 RUN pip install /tmp/tensorflow_pkg/tensorflow*.whl && \
-    rm -rf /tmp/tensorflow_pkg
+    rm -rf /tmp/tensorflow_pkg && \
+    /tmp/clean-layer.sh
+
+# https://cloud.google.com/tpu/docs/pytorch-xla-ug-tpu-vm#changing_pytorch_version
+RUN pip uninstall -y torch && \
+    pip install "torch==${TORCH_VERSION}" && \
+    pip install "torch_xla[tpuvm]" -f "https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-${TORCH_VERSION}-cp37-cp37m-linux_x86_64.whl" && \
+    /tmp/clean-layer.sh
+
+# https://cloud.google.com/tpu/docs/jax-quickstart-tpu-vm#install_jax_on_your_cloud_tpu_vm
+RUN pip install "jax[tpu]>=0.2.16" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html && \
+    /tmp/clean-layer.sh
diff --git a/tpu/build b/tpu/build
@@ -0,0 +1,57 @@
+#!/bin/bash
+set -e
+
+usage() {
+cat << EOF
+Usage: $0 [OPTIONS]
+Build a new Python TPU 1VM Docker image.
+
+Options:
+    -c, --use-cache           Use layer cache when building a new image.
+EOF
+}
+
+CACHE_FLAG='--no-cache'
+DOCKERFILE='Dockerfile'
+IMAGE_TAG='kaggle/python-tpuvm-build'
+BUILD_ARGS=''
+
+while :; do
+    case "$1" in 
+        -h|--help)
+            usage
+            exit
+            ;;
+        -c|--use-cache)
+            CACHE_FLAG=''
+            ;;
+        -?*)
+            usage
+            printf 'ERROR: Unknown option: %s\n' "$1" >&2
+            exit
+            ;;
+        *)            
+            break
+    esac
+
+    shift
+done
+
+BUILD_ARGS+=" --build-arg GIT_COMMIT=$(git rev-parse HEAD)"
+BUILD_ARGS+=" --build-arg BUILD_DATE=$(date '+%Y%m%d-%H%M%S')"
+
+# Read build args from config.txt file.
+SRCDIR=$(dirname "${BASH_SOURCE[0]}")
+for l in `cat ${SRCDIR}/config.txt`; do
+    BUILD_ARGS+=" --build-arg $l"
+done
+
+readonly CACHE_FLAG
+readonly DOCKERFILE
+readonly IMAGE_TAG
+readonly BUILD_ARGS
+
+DOCKERFILE_PATH="$SRCDIR/$DOCKERFILE"
+
+set -x
+docker build --rm --pull $CACHE_FLAG -t "$IMAGE_TAG" -f "$DOCKERFILE_PATH" $BUILD_ARGS .
diff --git a/tpu/config.txt b/tpu/config.txt
@@ -0,0 +1,5 @@
+# TODO(b/213335159): Use ci-pretest for BASE_IMAGE_TAG once stable.
+BASE_IMAGE_TAG=v108
+LIBTPU_IMAGE_TAG=libtpu_1.1.0_RC00
+TENSORFLOW_VERSION=2.8.0
+TORCH_VERSION=1.11.0

Original file line number	Diff line number	Diff line change
`@@ -4,3 +4,4 @@`
`4`	`4`	`import unittest`
`5`	`5`
`6`	`6`	`gpu_test = unittest.skipIf(len(os.environ.get('CUDA_VERSION', '')) == 0, 'Not running GPU tests')`
	`7`	`+tpu_test = unittest.skipIf(len(os.environ.get('ISTPUVM', '')) == 0, 'Not running TPU tests')`