Update Colab Base image to colab_20250219-060225_RC01 (#1475)

calderjo · web-flow · commit f2cdc779f33d · 2025-04-04T12:50:29.000-07:00
We are upgrading the base image to the latest release image by colab:
colab_20250219-060225_RC01

Which includes the following upgrades:
TF 2.18
Python 3.11
Cuda 12.5

This PR includes a handful of fixes to resolve conflicts related to
these upgrades, notably issues pertaining to torch and cudnn.

We also bumped the lightgbm version to 4.6.0.

We also included a fix for the `tune` CLI conflict between Ray and torchtune.
diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
@@ -31,29 +31,41 @@ RUN uv pip uninstall --system google-cloud-bigquery-storage
 # b/394382016: sigstore (dependency of kagglehub) requires a prerelease packages, installing separate.
 RUN uv pip install --system --force-reinstall --prerelease=allow kagglehub[pandas-datasets,hf-datasets,signing]>=0.3.9
 
+# b/408284143: google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1
+
+# b/408284435: Keras 3.6 broke test_keras.py > test_train > keras.datasets.mnist.load_data()
+# See https://github.com/keras-team/keras/commit/dcefb139863505d166dd1325066f329b3033d45a
+# Colab base is on Keras 3.8, so we have to install the pinned package separately.
+RUN uv pip install --system google-cloud-automl==1.0.1 google-cloud-aiplatform google-cloud-translate==3.12.1 \
+    google-cloud-videointelligence google-cloud-vision google-genai "keras<3.6"
+
 # uv cannot install this in requirements.txt without --no-build-isolation
 # to avoid affecting the larger build, we'll post-install it.
 RUN uv pip install --no-build-isolation --system "git+https://github.com/Kaggle/learntools"
 
-# b/385161357 Latest Colab uses tf 2.17.1, but tf decision forests only has a version for 2.17.0.
-# Instead, we'll install tfdf with its deps and hope that 2.17.0 compat tfdf works with tf 2.17.1.
-RUN uv pip install --system --no-deps tensorflow-decision-forests==1.10.0 wurlitzer==3.1.1 ydf==0.9.0
+# b/408281617: Torch insists on cudnn 9.1.x and cannot install 9.3.x, but TensorFlow only supports 9.3.x.
+# This conflict causes a number of package downgrades, which are handled in this command
+RUN uv pip install --system --force-reinstall --extra-index-url https://pypi.nvidia.com pynvjitlink-cu12 cuml-cu12==25.2.1 \
+    nvidia-cudnn-cu12==9.3.0.75 scipy tsfresh
+RUN uv pip install --system --force-reinstall pynvjitlink-cu12==0.5.2
 
 # b/385145217 Latest Colab lacks mkl numpy, install it.
 RUN uv pip install --system --force-reinstall -i https://pypi.anaconda.org/intel/simple numpy
 
-# b/328788268 We install an incompatible pair of libs (shapely<2, libpysal==4.9.2) so we can't put this one in the requirements.txt
 # newer daal4py requires tbb>=2022, but libpysal is downgrading it for some reason
 RUN uv pip install --system "tbb>=2022" "libpysal==4.9.2"
 
+# b/404590350: Ray and torchtune ship conflicting `tune` CLIs; we prioritize torchtune by force-reinstalling it.
+RUN uv pip install --system --force-reinstall --no-deps torchtune
+
 # Adding non-package dependencies:
 
 ADD clean-layer.sh  /tmp/clean-layer.sh
 ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl
 ADD patches/template_conf.json /opt/kaggle/conf.json
 
-# /opt/conda/lib/python3.10/site-packages
-ARG PACKAGE_PATH=/usr/local/lib/python3.10/dist-packages
+# /opt/conda/lib/python3.11/site-packages
+ARG PACKAGE_PATH=/usr/local/lib/python3.11/dist-packages
 
 # Install GPU-specific non-pip packages.
 {{ if eq .Accelerator "gpu" }}
@@ -108,6 +120,9 @@ RUN apt-get install -y libfreetype6-dev && \
     apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing
 
 # NLTK Project datasets
+# b/408298750: We currently force-reinstall nltk because we otherwise get the following error:
+# `AttributeError: module 'inspect' has no attribute 'formatargspec'. Did you mean: 'formatargvalues'?`
+RUN uv pip install --system --force-reinstall "nltk>=3.9.1"
 RUN mkdir -p /usr/share/nltk_data && \
     # NLTK Downloader no longer continues smoothly after an error, so we explicitly list
     # the corpuses that work
@@ -120,7 +135,7 @@ RUN mkdir -p /usr/share/nltk_data && \
     masc_tagged maxent_ne_chunker maxent_treebank_pos_tagger moses_sample movie_reviews \
     mte_teip5 names nps_chat omw opinion_lexicon paradigms \
     pil pl196x porter_test ppattach problem_reports product_reviews_1 product_reviews_2 propbank \
-    pros_cons ptb punkt qc reuters rslp rte sample_grammars semcor senseval sentence_polarity \
+    pros_cons ptb punkt punkt_tab qc reuters rslp rte sample_grammars semcor senseval sentence_polarity \
     sentiwordnet shakespeare sinica_treebank smultron snowball_data spanish_grammars \
     state_union stopwords subjectivity swadesh switchboard tagsets timit toolbox treebank \
     twitter_samples udhr2 udhr unicode_samples universal_tagset universal_treebanks_v20 \
@@ -198,7 +213,7 @@ ADD patches/kaggle_gcp.py \
 
 # Figure out why this is in a different place?
 # Found by doing a export PYTHONVERBOSE=1 and then running python and checking for where it looked for it.
-ADD patches/sitecustomize.py /usr/lib/python3.10/sitecustomize.py
+ADD patches/sitecustomize.py /usr/lib/python3.11/sitecustomize.py
 
 ARG GIT_COMMIT=unknown \
     BUILD_DATE=unknown
diff --git a/config.txt b/config.txt
@@ -1,5 +1,5 @@
 BASE_IMAGE=us-docker.pkg.dev/colab-images/public/runtime
-BASE_IMAGE_TAG=release-colab_20241217-060132_RC00
-LIGHTGBM_VERSION=4.5.0
+BASE_IMAGE_TAG=release-colab_20250219-060225_RC01
+LIGHTGBM_VERSION=4.6.0
 CUDA_MAJOR_VERSION=12
-CUDA_MINOR_VERSION=2
+CUDA_MINOR_VERSION=5
diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt
@@ -1,5 +1,4 @@
 # Please keep this in alphabetical order
---extra-index-url https://pypi.nvidia.com
 Altair>=5.4.0
 Babel
 Boruta
@@ -24,7 +23,6 @@ catboost
 category-encoders
 cesium
 comm
-cuml-cu12
 cytoolz
 dask-expr
 # Older versions of datasets fail with "Loading a dataset cached in a LocalFileSystem is not supported"
@@ -46,14 +44,6 @@ fuzzywuzzy
 geojson
 # geopandas > v0.14.4 breaks learn tools
 geopandas==v0.14.4
-google-cloud-aiplatform
-# google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1
-google-cloud-automl==1.0.1
-# b/315753846: Unpin translate package.
-google-cloud-translate==3.12.1
-google-cloud-videointelligence
-google-cloud-vision
-google-genai
 gpxpy
 h2o
 haversine
@@ -70,15 +60,11 @@ jupyter_server==2.12.5
 jupyterlab
 jupyterlab-lsp
 kaggle-environments
-# Keras 3.6 broke test_keras.py > test_train > keras.datasets.mnist.load_data():
-# See https://github.com/keras-team/keras/commit/dcefb139863505d166dd1325066f329b3033d45a
-keras<3.6
 keras-cv
 keras-nlp
 keras-tuner
 kornia
 langid
-leven
 # b/328788268: libpysal 4.10 seems to fail with "module 'shapely' has no attribute 'Geometry'. Did you mean: 'geometry'"
 libpysal<=4.9.2
 lime
@@ -142,12 +128,13 @@ squarify
 tensorflow-cloud
 tensorflow-io
 tensorflow-text
-# b/385161357: tf 2.17.1 does not have matching tensorflow_decision_forests release
-# tensorflow_decision_forests
+tensorflow_decision_forests
 timm
+torchao
 torchinfo
 torchmetrics
 torchtune
+triton
 tsfresh
 vtk
 wandb
diff --git a/tests/test_torchtune.py b/tests/test_torchtune.py
@@ -1,9 +1,10 @@
 import unittest
-
 import subprocess
 
 class TestTorchtune(unittest.TestCase):
     def test_help(self):
-        ret_code = subprocess.run(["tune", "--help"])
-        self.assertEqual(0, ret_code.returncode)
-        self.assertIsNone(ret_code.stderr)
+        result = subprocess.run(["tune", "--help"], stdout=subprocess.PIPE)
+
+        self.assertEqual(0, result.returncode)
+        self.assertIsNone(result.stderr)
+        self.assertIn("Download a model from the Hugging Face Hub or Kaggle Model Hub.", result.stdout.decode("utf-8"))