Skip to content

Commit cdba7d1

Browse files
authored
Merge pull request #1132 from Kaggle/tpu-1vm-image
Get tensorflow, jax, and pytorch working on TPU1VM
2 parents 389cecd + ba96747 commit cdba7d1

File tree

9 files changed

+158
-9
lines changed

9 files changed

+158
-9
lines changed

Jenkinsfile

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,22 @@ pipeline {
5656
'''
5757
}
5858
}
59+
stage('tensorflow TPU') {
60+
options {
61+
timeout(time: 180, unit: 'MINUTES')
62+
}
63+
steps {
64+
sh '''#!/bin/bash
65+
set -exo pipefail
66+
source tpu/config.txt
67+
cd packages/
68+
./build_package --base-image gcr.io/kaggle-images/python:${BASE_IMAGE_TAG} \
69+
--package tpu-tensorflow \
70+
--version $TENSORFLOW_VERSION \
71+
--push
72+
'''
73+
}
74+
}
5975
}
6076
}
6177
stage('Build/Test/Diff') {
@@ -150,7 +166,34 @@ pipeline {
150166
}
151167
}
152168
}
153-
}
169+
}
170+
stage('TPU VM') {
171+
stages {
172+
stage('Build Tensorflow TPU Image') {
173+
options {
174+
timeout(time: 20, unit: 'MINUTES')
175+
}
176+
steps {
177+
sh '''#!/bin/bash
178+
set -exo pipefail
179+
180+
./tpu/build | ts
181+
./push --tpu ${PRETEST_TAG}
182+
'''
183+
}
184+
}
185+
stage('Diff TPU VM Image') {
186+
steps {
187+
sh '''#!/bin/bash
188+
set -exo pipefail
189+
190+
docker pull gcr.io/kaggle-private-byod/python-tpuvm:${PRETEST_TAG}
191+
./diff --tpu --target gcr.io/kaggle-private-byod/python-tpuvm:${PRETEST_TAG}
192+
'''
193+
}
194+
}
195+
}
196+
}
154197
}
155198
}
156199

@@ -161,6 +204,7 @@ pipeline {
161204
162205
gcloud container images add-tag gcr.io/kaggle-images/python:${PRETEST_TAG} gcr.io/kaggle-images/python:${STAGING_TAG}
163206
gcloud container images add-tag gcr.io/kaggle-private-byod/python:${PRETEST_TAG} gcr.io/kaggle-private-byod/python:${STAGING_TAG}
207+
gcloud container images add-tag gcr.io/kaggle-private-byod/python-tpuvm:${PRETEST_TAG} gcr.io/kaggle-private-byod/python-tpuvm:${STAGING_TAG}
164208
'''
165209
}
166210
}

diff

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ while :; do
3232
BASE_IMAGE_TAG='gcr.io/kaggle-private-byod/python:latest'
3333
TARGET_IMAGE_TAG='kaggle/python-gpu-build'
3434
;;
35+
-x|--tpu)
36+
BASE_IMAGE_TAG='gcr.io/kaggle-private-byod/python-tpuvm:latest'
37+
TARGET_IMAGE_TAG='kaggle/python-tpuvm-build'
38+
;;
3539
-b|--base)
3640
if [[ -z "$2" ]]; then
3741
usage

packages/build_package

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,8 @@ fi
117117

118118
# Keep only `tf2-gpu.2-6:m80` in `gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80`
119119
TAG=${BASE_IMAGE/gcr.io\/deeplearning-platform-release\//}
120+
# Keep only `python:v108` in `gcr.io/kaggle-images/python:v108`
121+
TAG=${TAG/gcr.io\/kaggle-images\//}
120122
# Replace the `:` in `tf2-gpu.2-6:m80` by `-`
121123
TAG=${TAG/:/-}
122124
# Append the package version

tpu/tensorflow.Dockerfile renamed to packages/tpu-tensorflow.Dockerfile

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1-
ARG BASE_IMAGE_TAG
1+
ARG BASE_IMAGE
22

3-
FROM gcr.io/kaggle-images/python:${BASE_IMAGE_TAG} AS builder
3+
FROM ${BASE_IMAGE} AS builder
4+
5+
ARG PACKAGE_VERSION
46

57
# Use Bazelisk to ensure the proper bazel version is used.
68
RUN cd /usr/local/src && \
@@ -12,12 +14,12 @@ RUN cd /usr/local/src && \
1214
RUN cd /usr/local/src && \
1315
git clone https://github.com/tensorflow/tensorflow && \
1416
cd tensorflow && \
15-
git checkout tags/v${TENSORFLOW_VERSION} && \
17+
git checkout tags/v${PACKAGE_VERSION} && \
1618
# TODO(rosbo): Is it really needed?
1719
pip install keras_applications --no-deps && \
1820
pip install keras_preprocessing --no-deps
1921

20-
# Create a TensorFlow wheel for CPU
22+
# Create a TensorFlow wheel for TPU
2123
RUN cd /usr/local/src/tensorflow && \
2224
cat /dev/null | ./configure && \
2325
bazel build \
@@ -32,7 +34,22 @@ RUN cd /usr/local/src/tensorflow && \
3234
RUN cd /usr/local/src/tensorflow && \
3335
bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg
3436

35-
# TODO(b/152075195): Will likely need to install custom build for TFA & tensorflow-gcs-config
37+
# Build TensorFlow addons library against TensorFlow CPU.
38+
#RUN cd /usr/local/src/ && \
39+
# git clone https://github.com/tensorflow/addons && \
40+
# cd addons && \
41+
# git checkout tags/v0.12.1 && \
42+
# python ./configure.py && \
43+
# bazel build --enable_runfiles build_pip_pkg && \
44+
# bazel-bin/build_pip_pkg /tmp/tfa_cpu && \
45+
# bazel clean
46+
47+
# Build tensorflow_gcs_config library against TensorFlow CPU.
48+
#ADD tensorflow-gcs-config /usr/local/src/tensorflow_gcs_config/
49+
#RUN cd /usr/local/src/tensorflow_gcs_config && \
50+
# apt-get install -y libcurl4-openssl-dev && \
51+
# python setup.py bdist_wheel -d /tmp/tensorflow_gcs_config && \
52+
# bazel clean
3653

3754
# Use multi-stage builds to minimize image output size.
3855
FROM alpine:latest

push

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ Push a newly-built image with the given LABEL to gcr.io and DockerHub.
88
99
Options:
1010
-g, --gpu Push the image with GPU support.
11+
-t, --tpu Push the image with TPU support.
1112
-s, --source-image IMAGE Tag for the source image.
1213
EOF
1314
}
@@ -26,6 +27,10 @@ while :; do
2627
SOURCE_IMAGE_TAG='kaggle/python-gpu-build:latest'
2728
TARGET_IMAGE='gcr.io/kaggle-private-byod/python'
2829
;;
30+
-t|--tpu)
31+
SOURCE_IMAGE_TAG='kaggle/python-tpuvm-build:latest'
32+
TARGET_IMAGE='gcr.io/kaggle-private-byod/python-tpuvm'
33+
;;
2934
-s|--source-image)
3035
if [[ -z $2 ]]; then
3136
usage

tests/common.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@
44
import unittest
55

66
gpu_test = unittest.skipIf(len(os.environ.get('CUDA_VERSION', '')) == 0, 'Not running GPU tests')
7+
tpu_test = unittest.skipIf(len(os.environ.get('ISTPUVM', '')) == 0, 'Not running TPU tests')

tpu/Dockerfile

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,27 @@
11
ARG BASE_IMAGE_TAG
22
ARG LIBTPU_IMAGE_TAG
3-
ARG TENSORFLOW_WHL_IMAGE_TAG
3+
ARG TENSORFLOW_VERSION
4+
ARG TORCH_VERSION
45

56
FROM gcr.io/cloud-tpu-v2-images/libtpu:${LIBTPU_IMAGE_TAG} as libtpu
6-
FROM gcr.io/kaggle-images/python-tpu-tensorflow-whl:${TENSORFLOW_WHL_IMAGE_TAG} AS tensorflow_whl
7+
FROM gcr.io/kaggle-images/python-tpu-tensorflow-whl:python-${BASE_IMAGE_TAG}-${TENSORFLOW_VERSION} AS tensorflow_whl
78
FROM gcr.io/kaggle-images/python:${BASE_IMAGE_TAG}
89

10+
ENV ISTPUVM=1
11+
912
COPY --from=libtpu /libtpu.so /lib
1013

1114
COPY --from=tensorflow_whl /tmp/tensorflow_pkg/tensorflow*.whl /tmp/tensorflow_pkg/
1215
RUN pip install /tmp/tensorflow_pkg/tensorflow*.whl && \
13-
rm -rf /tmp/tensorflow_pkg
16+
rm -rf /tmp/tensorflow_pkg && \
17+
/tmp/clean-layer.sh
18+
# https://cloud.google.com/tpu/docs/pytorch-xla-ug-tpu-vm#changing_pytorch_version
# An ARG declared before the first FROM is only visible to FROM instructions.
# Redeclare TORCH_VERSION inside this stage so the RUN below sees the value
# passed with --build-arg instead of an empty string.
ARG TORCH_VERSION
# Replace the torch shipped in the base image with the pinned version, then
# install the matching torch_xla TPU VM wheel. The extras spec is quoted so the
# shell never treats `[tpuvm]` as a glob pattern.
RUN pip uninstall -y torch && \
    pip install torch==${TORCH_VERSION} && \
    pip install "torch_xla[tpuvm]" -f https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-${TORCH_VERSION}-cp37-cp37m-linux_x86_64.whl && \
    /tmp/clean-layer.sh
24+
25+
# https://cloud.google.com/tpu/docs/jax-quickstart-tpu-vm#install_jax_on_your_cloud_tpu_vm
26+
RUN pip install "jax[tpu]>=0.2.16" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html && \
27+
/tmp/clean-layer.sh

tpu/build

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
#!/bin/bash
#
# Build the Python TPU VM Docker image and tag it `kaggle/python-tpuvm-build`.
#
# Build args passed to `docker build`:
#   GIT_COMMIT  - output of `git rev-parse HEAD`
#   BUILD_DATE  - current timestamp (YYYYmmdd-HHMMSS)
#   plus every KEY=VALUE entry found in config.txt next to this script.
#
# Exit status: 0 on success or --help, non-zero on an unknown option or when
# any command fails.
set -e

# Print the command-line help to stdout.
usage() {
cat << EOF
Usage: $0 [OPTIONS]
Build a new Python TPU 1VM Docker image.

Options:
    -c, --use-cache Use layer cache when building a new image.
EOF
}

# Arrays (rather than space-joined strings) keep every argument a single word
# when it is expanded on the docker command line.
CACHE_ARGS=(--no-cache)
DOCKERFILE='Dockerfile'
IMAGE_TAG='kaggle/python-tpuvm-build'
BUILD_ARGS=()

while :; do
  case "$1" in
    -h|--help)
      usage
      exit
      ;;
    -c|--use-cache)
      CACHE_ARGS=()
      ;;
    -?*)
      usage
      printf 'ERROR: Unknown option: %s\n' "$1" >&2
      # A usage error must not look like a successful build to the caller.
      exit 1
      ;;
    *)
      break
      ;;
  esac

  shift
done

# Assign separately so a failing command aborts the script under `set -e`.
GIT_COMMIT=$(git rev-parse HEAD)
BUILD_DATE=$(date '+%Y%m%d-%H%M%S')
BUILD_ARGS+=(--build-arg "GIT_COMMIT=${GIT_COMMIT}")
BUILD_ARGS+=(--build-arg "BUILD_DATE=${BUILD_DATE}")

# Read build args from config.txt file, one KEY=VALUE entry per line.
# Blank lines and `#` comment lines are skipped; word-splitting the whole file
# would turn every word of a comment into a bogus --build-arg.
SRCDIR=$(dirname "${BASH_SOURCE[0]}")
while IFS= read -r entry || [[ -n "$entry" ]]; do
  # Trim leading and trailing whitespace.
  entry="${entry#"${entry%%[![:space:]]*}"}"
  entry="${entry%"${entry##*[![:space:]]}"}"
  if [[ -z "$entry" || "$entry" == '#'* ]]; then
    continue
  fi
  BUILD_ARGS+=(--build-arg "$entry")
done < "${SRCDIR}/config.txt"

readonly CACHE_ARGS
readonly DOCKERFILE
readonly IMAGE_TAG
readonly BUILD_ARGS

DOCKERFILE_PATH="$SRCDIR/$DOCKERFILE"

# Trace the final docker invocation so the exact build args show up in CI logs.
set -x
docker build --rm --pull "${CACHE_ARGS[@]}" -t "$IMAGE_TAG" -f "$DOCKERFILE_PATH" "${BUILD_ARGS[@]}" .

tpu/config.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# TODO(b/213335159): Use ci-pretest for BASE_IMAGE_TAG once stable.
2+
BASE_IMAGE_TAG=v108
3+
LIBTPU_IMAGE_TAG=libtpu_1.1.0_RC00
4+
TENSORFLOW_VERSION=2.8.0
5+
TORCH_VERSION=1.11.0

0 commit comments

Comments
 (0)