Merge branch 'main' into upgrade-tf2.9

djherbis · web-flow · commit c72a84a553ac · 2022-10-25T13:37:12.000-04:00
diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
@@ -497,6 +497,8 @@ RUN pip install flashtext && \
     pip install optuna && \
     pip install plotly_express && \
     pip install albumentations && \
+    # b/254245259 catalyst requires accelerate, but breaks with accelerate 0.13.1, so pin 0.12.0.
+    pip install accelerate==0.12.0 && \
     # Breaks protobuf compatibiilty in newer versions:
     pip install catalyst tensorboardX==2.5.1 && \
     # b/206990323 osmx 1.1.2 requires numpy >= 1.21 which we don't want. 
@@ -533,6 +535,8 @@ RUN pip install flashtext && \
     pip install ipympl==0.7.0 && \
     pip install pandarallel && \
     pip install onnx && \
+    pip install tables && \
+    pip install openpyxl && \
     /tmp/clean-layer.sh
 
 # Download base easyocr models.
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -56,22 +56,6 @@ pipeline {
             '''
           }
         }
-        stage('tensorflow TPU') {
-          options {
-            timeout(time: 240, unit: 'MINUTES')
-          }
-          steps {
-            sh '''#!/bin/bash
-              set -exo pipefail
-              source tpu/config.txt
-              cd packages/
-              ./build_package --base-image gcr.io/kaggle-images/python:${BASE_IMAGE_TAG} \
-                --package tpu-tensorflow \
-                --version $TENSORFLOW_VERSION \
-                --push
-            '''
-          }
-        }
       }
     }
     stage('Build/Test/Diff') {
@@ -142,17 +126,37 @@ pipeline {
               }
             }
             stage('Test GPU Image') {
-              options {
-                timeout(time: 20, unit: 'MINUTES')
-              }
-              steps {
-                sh '''#!/bin/bash
-                  set -exo pipefail
+              stages {
+                stage('Test on P100') {
+                  agent { label 'ephemeral-linux-gpu' }
+                  options {
+                    timeout(time: 20, unit: 'MINUTES')
+                  }
+                  steps {
+                    sh '''#!/bin/bash
+                      set -exo pipefail
 
-                  date
-                  docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
-                  ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
-                '''
+                      date
+                      docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
+                      ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
+                    '''
+                  }
+                }
+                stage('Test on T4x2') {
+                  agent { label 'ephemeral-linux-gpu-t4x2' }
+                  options {
+                    timeout(time: 20, unit: 'MINUTES')
+                  }
+                  steps {
+                    sh '''#!/bin/bash
+                      set -exo pipefail
+
+                      date
+                      docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
+                      ./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
+                    '''
+                  }
+                }
               }
             }
             stage('Diff GPU Image') {
@@ -171,23 +175,12 @@ pipeline {
           stages {
             stage('Build Tensorflow TPU Image') {
               options {
-                timeout(time: 20, unit: 'MINUTES')
+                timeout(time: 60, unit: 'MINUTES')
               }
               steps {
                 sh '''#!/bin/bash
                   set -exo pipefail
 
-                  # Login to docker to get access to gcr.io/cloud-tpu-v2-images/libtpu
-                  # SA: jenkins-test@kaggle-playground-170215.iam.gserviceaccount.com
-                  # To grant access to a SA, start a TPU VM with that SA once.
-                  # Disable echo to avoid printing sensitive tokens:
-                  set +x
-                  METADATA=http://metadata.google.internal/computeMetadata/v1
-                  SVC_ACCT=$METADATA/instance/service-accounts/default
-                  ACCESS_TOKEN=$(/usr/bin/curl -s -H 'Metadata-Flavor: Google' $SVC_ACCT/token | cut -d'"' -f 4)
-                  docker login --username oauth2accesstoken --password $ACCESS_TOKEN https://gcr.io
-                  set -x
-
                   ./tpu/build | ts
                   ./push --tpu ${PRETEST_TAG}
                 '''
diff --git a/packages/tpu-tensorflow.Dockerfile b/packages/tpu-tensorflow.Dockerfile
diff --git a/test_pytables.py b/test_pytables.py
@@ -0,0 +1,22 @@
+import os
+import tempfile
+import unittest
+
+import pandas as pd
+from pandas.testing import assert_frame_equal
+
+
+class TestPandasPytables(unittest.TestCase):
+    """Round-trips a DataFrame through pandas' HDF5 writer and reader."""
+
+    def test_rw_hd5(self):
+        """A frame written with `to_hdf` must be read back unchanged by `read_hdf`."""
+        want = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z'])
+        # Write into a throwaway directory so the test leaves no artifact behind
+        # and does not depend on the working directory being writable.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            path = os.path.join(tmp_dir, 'want.h5')
+            # `key` is passed by keyword: newer pandas deprecates passing it positionally.
+            want.to_hdf(path, key='data')
+            got = pd.read_hdf(path)
+        assert_frame_equal(want, got)
diff --git a/tests/test_openpyxl.py b/tests/test_openpyxl.py
@@ -0,0 +1,21 @@
+import os
+import tempfile
+import unittest
+
+import pandas as pd
+from pandas.testing import assert_frame_equal
+
+
+class TestPandasOpenPyXL(unittest.TestCase):
+    """Round-trips a DataFrame through pandas' Excel I/O using the openpyxl engine."""
+
+    def test_rw_excel(self):
+        """A frame written with `to_excel` must be read back unchanged by `read_excel`."""
+        want = pd.DataFrame([[1, 10, 'a']], columns=['x', 'y', 'z'])
+        # Write into a throwaway directory so the test leaves no artifact behind
+        # and does not depend on the working directory being writable.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            path = os.path.join(tmp_dir, 'want.xlsx')
+            want.to_excel(path, index=False, engine="openpyxl")
+            got = pd.read_excel(path, engine="openpyxl")
+        assert_frame_equal(want, got)
diff --git a/tpu/Dockerfile b/tpu/Dockerfile
@@ -1,43 +1,65 @@
-ARG BASE_IMAGE_TAG
-ARG TENSORFLOW_VERSION
-
-FROM gcr.io/kaggle-images/python-tpu-tensorflow-whl:python-${BASE_IMAGE_TAG}-${TENSORFLOW_VERSION} AS tensorflow_whl
-FROM gcr.io/kaggle-images/python:${BASE_IMAGE_TAG}
+FROM python:3.8
 
 # We need to define the ARG here to get the ARG below the FROM statement to access it within this build context
 # See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
 ARG TORCH_VERSION
+ARG TENSORFLOW_VERSION
 
 ENV ISTPUVM=1
 
-COPY --from=tensorflow_whl /tmp/tensorflow_pkg/tensorflow*.whl /tmp/tensorflow_pkg/
-RUN pip install /tmp/tensorflow_pkg/tensorflow*.whl && \
-    rm -rf /tmp/tensorflow_pkg && \
-    /tmp/clean-layer.sh
+COPY patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl
+COPY patches/template_conf.json /opt/kaggle/conf.json
+
+# Tensorflow wheel and libtpu.so (curl --fail stops the build on an HTTP error instead of saving the error page as the artifact):
+# The wheel is only downloaded here, not installed. Once tensorflow can be installed alongside JAX/PyTorch, drop the download and install it directly:
+# RUN pip install https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/tf-${TENSORFLOW_VERSION}/tensorflow-${TENSORFLOW_VERSION}-cp38-cp38-linux_x86_64.whl
+RUN mkdir -p /lib/wheels && curl --fail --location --output /lib/wheels/tensorflow-${TENSORFLOW_VERSION}-cp38-cp38-linux_x86_64.whl https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/tf-${TENSORFLOW_VERSION}/tensorflow-${TENSORFLOW_VERSION}-cp38-cp38-linux_x86_64.whl
+RUN curl --fail --location --output /lib/libtpu.so https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/libtpu/1.3.0/libtpu.so
 
 # LIBTPU installed here:
-ENV DEFAULT_LIBTPU=/opt/conda/lib/python3.7/site-packages/libtpu/libtpu.so
-ENV PYTORCH_LIBTPU=/opt/conda/lib/python3.7/site-packages/libtpu/torch-libtpu.so
-ENV JAX_LIBTPU=/opt/conda/lib/python3.7/site-packages/libtpu/jax-libtpu.so
+ENV PIP_LIBTPU=/usr/local/lib/python3.8/site-packages/libtpu/libtpu.so
+ENV DEFAULT_LIBTPU=/lib/libtpu.so
+ENV PYTORCH_LIBTPU=/lib/torch-libtpu.so
+ENV JAX_LIBTPU=/lib/jax-libtpu.so
+
+# Install JAX & related packages (--no-cache-dir keeps pip's download cache out of the image layer)
+# https://cloud.google.com/tpu/docs/jax-quickstart-tpu-vm#install_jax_on_your_cloud_tpu_vm
+RUN pip install --no-cache-dir "jax[tpu]==0.3.10" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html trax flax optax elegy git+https://github.com/deepmind/dm-haiku jraph distrax
+# Copy the libtpu.so found at $PIP_LIBTPU after the install above to the JAX-specific path.
+RUN cp $PIP_LIBTPU $JAX_LIBTPU
 
+# Install Pytorch & related packages
 # https://cloud.google.com/tpu/docs/pytorch-xla-ug-tpu-vm#changing_pytorch_version
-RUN pip uninstall -y torch && \
-    pip install torch==${TORCH_VERSION} && \
-    # The URL doesn't include patch version. i.e. must use 1.11 instead of 1.11.0
-    pip install torch_xla[tpuvm] -f https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-${TORCH_VERSION%.*}-cp37-cp37m-linux_x86_64.whl && \
-    cp $DEFAULT_LIBTPU $PYTORCH_LIBTPU && \
-    /tmp/clean-layer.sh
+# The wheel URL omits the patch version, i.e. it must use 1.11 instead of 1.11.0 (hence ${TORCH_VERSION%.*}).
+RUN pip install --no-cache-dir torch==${TORCH_VERSION} "torch_xla[tpuvm]" -f https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-${TORCH_VERSION%.*}-cp38-cp38-linux_x86_64.whl torchvision==0.12.0 torchtext==0.12.0 torchaudio==0.11.0
 
-# https://cloud.google.com/tpu/docs/jax-quickstart-tpu-vm#install_jax_on_your_cloud_tpu_vm
-RUN pip install "jax[tpu]>=0.2.16" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html && \
-    cp $DEFAULT_LIBTPU $JAX_LIBTPU && \
-    /tmp/clean-layer.sh
+RUN cp $PIP_LIBTPU $PYTORCH_LIBTPU
 
 # Monkey-patch TF, JAX & PYTORCH to load the correct libtpu.so when they are imported:
-RUN sed -i "s|^\(\(.*\)libtpu.configure_library_path.*\)|\1\n\2os.environ['TPU_LIBRARY_PATH'] = '${PYTORCH_LIBTPU}'|" /opt/conda/lib/python3.7/site-packages/torch_xla/__init__.py && \
-    sed -i "s|^\(\(.*\)libtpu.configure_library_path.*\)|\1\n\2os.environ['TPU_LIBRARY_PATH'] = '${JAX_LIBTPU}'|" /opt/conda/lib/python3.7/site-packages/jax/_src/cloud_tpu_init.py && \
-    sed -i "1s/^/from jax._src.cloud_tpu_init import cloud_tpu_init\ncloud_tpu_init()\n/" /opt/conda/lib/python3.7/site-packages/tensorflow/__init__.py
+RUN sed -i "s|^\(\(.*\)libtpu.configure_library_path.*\)|\1\n\2os.environ['TPU_LIBRARY_PATH'] = '${PYTORCH_LIBTPU}'|" /usr/local/lib/python3.8/site-packages/torch_xla/__init__.py && \
+    sed -i "s|^\(\(.*\)libtpu.configure_library_path.*\)|\1\n\2os.environ['TPU_LIBRARY_PATH'] = '${JAX_LIBTPU}'|" /usr/local/lib/python3.8/site-packages/jax/_src/cloud_tpu_init.py
+
+# Packages needed by the Notebook editor (extras are quoted so the shell cannot glob-expand the brackets):
+RUN pip install --no-cache-dir papermill jupyterlab "python-lsp-server[all]" jupyterlab-lsp
+
+# Additional useful packages should be added here:
+RUN pip install --no-cache-dir pandas
 
 # Set these env vars so that they don't produce errs calling the metadata server to load them:
 ENV TPU_ACCELERATOR_TYPE=v3-8
-ENV TPU_PROCESS_ADDRESSES=local
+ENV TPU_PROCESS_ADDRESSES=local
+
+# Metadata
+ARG GIT_COMMIT=unknown
+ARG BUILD_DATE=unknown
+
+LABEL git-commit=$GIT_COMMIT
+LABEL build-date=$BUILD_DATE
+ENV GIT_COMMIT=${GIT_COMMIT}
+ENV BUILD_DATE=${BUILD_DATE}
+
+LABEL tensorflow-version=$TENSORFLOW_VERSION
+LABEL kaggle-lang=python
+
+# Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`.
+RUN echo "$GIT_COMMIT" > /etc/git_commit && echo "$BUILD_DATE" > /etc/build_date
diff --git a/tpu/config.txt b/tpu/config.txt
@@ -1,4 +1,2 @@
-# TODO(b/213335159): Use ci-pretest for BASE_IMAGE_TAG once stable.
-BASE_IMAGE_TAG=v115
-TENSORFLOW_VERSION=2.8.0
+TENSORFLOW_VERSION=2.9.1
 TORCH_VERSION=1.11.0