Skip to content

Commit c72a84a

Browse files
authored
Merge branch 'main' into upgrade-tf2.9
2 parents fbf984e + 3ed70bb commit c72a84a

File tree

7 files changed

+110
-123
lines changed

7 files changed

+110
-123
lines changed

Dockerfile.tmpl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,8 @@ RUN pip install flashtext && \
497497
pip install optuna && \
498498
pip install plotly_express && \
499499
pip install albumentations && \
500+
# b/254245259 catalyst requires accelerate, but it breaks with accelerate 0.13.1, so pin the previous release.
501+
pip install accelerate==0.12.0 && \
500502
# Breaks protobuf compatibility in newer versions:
501503
pip install catalyst tensorboardX==2.5.1 && \
502504
# b/206990323 osmx 1.1.2 requires numpy >= 1.21 which we don't want.
@@ -533,6 +535,8 @@ RUN pip install flashtext && \
533535
pip install ipympl==0.7.0 && \
534536
pip install pandarallel && \
535537
pip install onnx && \
538+
pip install tables && \
539+
pip install openpyxl && \
536540
/tmp/clean-layer.sh
537541

538542
# Download base easyocr models.

Jenkinsfile

Lines changed: 31 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -56,22 +56,6 @@ pipeline {
5656
'''
5757
}
5858
}
59-
stage('tensorflow TPU') {
60-
options {
61-
timeout(time: 240, unit: 'MINUTES')
62-
}
63-
steps {
64-
sh '''#!/bin/bash
65-
set -exo pipefail
66-
source tpu/config.txt
67-
cd packages/
68-
./build_package --base-image gcr.io/kaggle-images/python:${BASE_IMAGE_TAG} \
69-
--package tpu-tensorflow \
70-
--version $TENSORFLOW_VERSION \
71-
--push
72-
'''
73-
}
74-
}
7559
}
7660
}
7761
stage('Build/Test/Diff') {
@@ -142,17 +126,37 @@ pipeline {
142126
}
143127
}
144128
stage('Test GPU Image') {
145-
options {
146-
timeout(time: 20, unit: 'MINUTES')
147-
}
148-
steps {
149-
sh '''#!/bin/bash
150-
set -exo pipefail
129+
stages {
130+
stage('Test on P100') {
131+
agent { label 'ephemeral-linux-gpu' }
132+
options {
133+
timeout(time: 20, unit: 'MINUTES')
134+
}
135+
steps {
136+
sh '''#!/bin/bash
137+
set -exo pipefail
151138
152-
date
153-
docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
154-
./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
155-
'''
139+
date
140+
docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
141+
./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
142+
'''
143+
}
144+
}
145+
stage('Test on T4x2') {
146+
agent { label 'ephemeral-linux-gpu-t4x2' }
147+
options {
148+
timeout(time: 20, unit: 'MINUTES')
149+
}
150+
steps {
151+
sh '''#!/bin/bash
152+
set -exo pipefail
153+
154+
date
155+
docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
156+
./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
157+
'''
158+
}
159+
}
156160
}
157161
}
158162
stage('Diff GPU Image') {
@@ -171,23 +175,12 @@ pipeline {
171175
stages {
172176
stage('Build Tensorflow TPU Image') {
173177
options {
174-
timeout(time: 20, unit: 'MINUTES')
178+
timeout(time: 60, unit: 'MINUTES')
175179
}
176180
steps {
177181
sh '''#!/bin/bash
178182
set -exo pipefail
179183
180-
# Login to docker to get access to gcr.io/cloud-tpu-v2-images/libtpu
181-
# SA: jenkins-test@kaggle-playground-170215.iam.gserviceaccount.com
182-
# To grant access to a SA, start a TPU VM with that SA once.
183-
# Disable echo to avoid printing sensitive tokens:
184-
set +x
185-
METADATA=http://metadata.google.internal/computeMetadata/v1
186-
SVC_ACCT=$METADATA/instance/service-accounts/default
187-
ACCESS_TOKEN=$(/usr/bin/curl -s -H 'Metadata-Flavor: Google' $SVC_ACCT/token | cut -d'"' -f 4)
188-
docker login --username oauth2accesstoken --password $ACCESS_TOKEN https://gcr.io
189-
set -x
190-
191184
./tpu/build | ts
192185
./push --tpu ${PRETEST_TAG}
193186
'''

packages/tpu-tensorflow.Dockerfile

Lines changed: 0 additions & 56 deletions
This file was deleted.

test_pytables.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import unittest
2+
3+
import pandas as pd
4+
from pandas.testing import assert_frame_equal
5+
6+
7+
class TestPandasPytables(unittest.TestCase):
8+
9+
def test_rw_hd5(self):
10+
want = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z'])
11+
want.to_hdf('./want.h5', 'data')
12+
got = pd.read_hdf('./want.h5')
13+
assert_frame_equal(want, got)

tests/test_openpyxl.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import unittest
2+
3+
import pandas as pd
4+
from pandas.testing import assert_frame_equal
5+
6+
7+
class TestPandasOpenPyXL(unittest.TestCase):
8+
9+
def test_rw_excel(self):
10+
want = pd.DataFrame([[1, 10, 'a']], columns=['x', 'y', 'z'])
11+
want.to_excel('./want.xlsx', index=False, engine="openpyxl")
12+
got = pd.read_excel('./want.xlsx', engine="openpyxl")
13+
assert_frame_equal(want, got)

tpu/Dockerfile

Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,65 @@
1-
ARG BASE_IMAGE_TAG
2-
ARG TENSORFLOW_VERSION
3-
4-
FROM gcr.io/kaggle-images/python-tpu-tensorflow-whl:python-${BASE_IMAGE_TAG}-${TENSORFLOW_VERSION} AS tensorflow_whl
5-
FROM gcr.io/kaggle-images/python:${BASE_IMAGE_TAG}
1+
FROM python:3.8
62

73
# The ARGs must be declared here, below the FROM statement, so that they are accessible within this build stage.
84
# See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
95
ARG TORCH_VERSION
6+
ARG TENSORFLOW_VERSION
107

118
ENV ISTPUVM=1
129

13-
COPY --from=tensorflow_whl /tmp/tensorflow_pkg/tensorflow*.whl /tmp/tensorflow_pkg/
14-
RUN pip install /tmp/tensorflow_pkg/tensorflow*.whl && \
15-
rm -rf /tmp/tensorflow_pkg && \
16-
/tmp/clean-layer.sh
10+
ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl
11+
ADD patches/template_conf.json /opt/kaggle/conf.json
12+
13+
# Tensorflow wheel:
14+
# Once tensorflow can be installed alongside JAX/PyTorch, we no longer need to include the wheel and can install it directly.
15+
# RUN pip install https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/tf-${TENSORFLOW_VERSION}/tensorflow-${TENSORFLOW_VERSION}-cp38-cp38-linux_x86_64.whl
16+
RUN mkdir -p /lib/wheels && curl --output /lib/wheels/tensorflow-${TENSORFLOW_VERSION}-cp38-cp38-linux_x86_64.whl https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/tf-${TENSORFLOW_VERSION}/tensorflow-${TENSORFLOW_VERSION}-cp38-cp38-linux_x86_64.whl
17+
RUN curl --output /lib/libtpu.so https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/libtpu/1.3.0/libtpu.so
1718

1819
# LIBTPU installed here:
19-
ENV DEFAULT_LIBTPU=/opt/conda/lib/python3.7/site-packages/libtpu/libtpu.so
20-
ENV PYTORCH_LIBTPU=/opt/conda/lib/python3.7/site-packages/libtpu/torch-libtpu.so
21-
ENV JAX_LIBTPU=/opt/conda/lib/python3.7/site-packages/libtpu/jax-libtpu.so
20+
ENV PIP_LIBTPU=/usr/local/lib/python3.8/site-packages/libtpu/libtpu.so
21+
ENV DEFAULT_LIBTPU=/lib/libtpu.so
22+
ENV PYTORCH_LIBTPU=/lib/torch-libtpu.so
23+
ENV JAX_LIBTPU=/lib/jax-libtpu.so
24+
25+
# Install JAX & related packages
26+
# https://cloud.google.com/tpu/docs/jax-quickstart-tpu-vm#install_jax_on_your_cloud_tpu_vm
27+
RUN pip install "jax[tpu]==0.3.10" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html trax flax optax elegy git+https://github.com/deepmind/dm-haiku jraph distrax
28+
29+
RUN cp $PIP_LIBTPU $JAX_LIBTPU
2230

31+
# Install Pytorch & related packages
2332
# https://cloud.google.com/tpu/docs/pytorch-xla-ug-tpu-vm#changing_pytorch_version
24-
RUN pip uninstall -y torch && \
25-
pip install torch==${TORCH_VERSION} && \
26-
# The URL doesn't include patch version. i.e. must use 1.11 instead of 1.11.0
27-
pip install torch_xla[tpuvm] -f https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-${TORCH_VERSION%.*}-cp37-cp37m-linux_x86_64.whl && \
28-
cp $DEFAULT_LIBTPU $PYTORCH_LIBTPU && \
29-
/tmp/clean-layer.sh
33+
# The URL doesn't include the patch version, i.e. it must use 1.11 instead of 1.11.0.
34+
RUN pip install torch==${TORCH_VERSION} torch_xla[tpuvm] -f https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-${TORCH_VERSION%.*}-cp38-cp38-linux_x86_64.whl torchvision==0.12.0 torchtext==0.12.0 torchaudio==0.11.0
3035

31-
# https://cloud.google.com/tpu/docs/jax-quickstart-tpu-vm#install_jax_on_your_cloud_tpu_vm
32-
RUN pip install "jax[tpu]>=0.2.16" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html && \
33-
cp $DEFAULT_LIBTPU $JAX_LIBTPU && \
34-
/tmp/clean-layer.sh
36+
RUN cp $PIP_LIBTPU $PYTORCH_LIBTPU
3537

3638
# Monkey-patch TF, JAX & PYTORCH to load the correct libtpu.so when they are imported:
37-
RUN sed -i "s|^\(\(.*\)libtpu.configure_library_path.*\)|\1\n\2os.environ['TPU_LIBRARY_PATH'] = '${PYTORCH_LIBTPU}'|" /opt/conda/lib/python3.7/site-packages/torch_xla/__init__.py && \
38-
sed -i "s|^\(\(.*\)libtpu.configure_library_path.*\)|\1\n\2os.environ['TPU_LIBRARY_PATH'] = '${JAX_LIBTPU}'|" /opt/conda/lib/python3.7/site-packages/jax/_src/cloud_tpu_init.py && \
39-
sed -i "1s/^/from jax._src.cloud_tpu_init import cloud_tpu_init\ncloud_tpu_init()\n/" /opt/conda/lib/python3.7/site-packages/tensorflow/__init__.py
39+
RUN sed -i "s|^\(\(.*\)libtpu.configure_library_path.*\)|\1\n\2os.environ['TPU_LIBRARY_PATH'] = '${PYTORCH_LIBTPU}'|" /usr/local/lib/python3.8/site-packages/torch_xla/__init__.py && \
40+
sed -i "s|^\(\(.*\)libtpu.configure_library_path.*\)|\1\n\2os.environ['TPU_LIBRARY_PATH'] = '${JAX_LIBTPU}'|" /usr/local/lib/python3.8/site-packages/jax/_src/cloud_tpu_init.py
41+
42+
# Packages needed by the Notebook editor:
43+
RUN pip install papermill jupyterlab python-lsp-server[all] jupyterlab-lsp
44+
45+
# Additional useful packages should be added here:
46+
RUN pip install pandas
4047

4148
# Set these env vars explicitly so that loading them doesn't produce errors from calling the metadata server:
4249
ENV TPU_ACCELERATOR_TYPE=v3-8
43-
ENV TPU_PROCESS_ADDRESSES=local
50+
ENV TPU_PROCESS_ADDRESSES=local
51+
52+
# Metadata
53+
ARG GIT_COMMIT=unknown
54+
ARG BUILD_DATE=unknown
55+
56+
LABEL git-commit=$GIT_COMMIT
57+
LABEL build-date=$BUILD_DATE
58+
ENV GIT_COMMIT=${GIT_COMMIT}
59+
ENV BUILD_DATE=${BUILD_DATE}
60+
61+
LABEL tensorflow-version=$TENSORFLOW_VERSION
62+
LABEL kaggle-lang=python
63+
64+
# Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`.
65+
RUN echo "$GIT_COMMIT" > /etc/git_commit && echo "$BUILD_DATE" > /etc/build_date

tpu/config.txt

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,2 @@
1-
# TODO(b/213335159): Use ci-pretest for BASE_IMAGE_TAG once stable.
2-
BASE_IMAGE_TAG=v115
3-
TENSORFLOW_VERSION=2.8.0
1+
TENSORFLOW_VERSION=2.9.1
42
TORCH_VERSION=1.11.0

0 commit comments

Comments
 (0)