Fix torch tune, keras, tensorflow tests (#1489)

calderjo · web-flow · commit f6db354afe8a · 2025-06-26T17:45:37.000-07:00
Looks like torchtune changed the output of its --help command, which
caused issues with our smoke tests.
Keras, along with other packages, had existing issues with cudnn being
downgraded due to torch requirements, so we pinned the relevant packages
to fix those tests.
diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
@@ -35,7 +35,10 @@ RUN uv pip install --no-build-isolation --system "git+https://github.com/Kaggle/
 # b/408281617: Torch is adamant that it can not install cudnn 9.3.x, only 9.1.x, but Tensorflow can only support 9.3.x.
 # This conflict causes a number of package downgrades, which are handled in this command
 RUN uv pip install --system --force-reinstall --extra-index-url https://pypi.nvidia.com "cuml-cu12==25.2.1" \
-    "nvidia-cudnn-cu12==9.3.0.75"
+    "nvidia-cudnn-cu12==9.3.0.75" "nvidia-cublas-cu12==12.5.3.2" "nvidia-cusolver-cu12==11.6.3.83" \
+    "nvidia-cuda-cupti-cu12==12.5.82" "nvidia-cuda-nvrtc-cu12==12.5.82" "nvidia-cuda-runtime-cu12==12.5.82" \
+    "nvidia-cufft-cu12==11.2.3.61" "nvidia-curand-cu12==10.3.6.82" "nvidia-cusparse-cu12==12.5.1.3" \
+    "nvidia-nvjitlink-cu12==12.5.82"
 RUN uv pip install --system --force-reinstall "pynvjitlink-cu12==0.5.2"
 
 # b/385145217 Latest Colab lacks mkl numpy, install it.
@@ -46,7 +49,7 @@ RUN uv pip install --system "tbb>=2022" "libpysal==4.9.2"
 
 # b/404590350: Ray and torchtune have conflicting tune cli, we will prioritize torchtune.
 # b/415358158: Gensim removed from Colab image to upgrade scipy
-RUN uv pip install --system --force-reinstall --no-deps torchtune gensim
+RUN uv pip install --system --force-reinstall --no-deps torchtune gensim "scipy<=1.15.3"
 
 # Adding non-package dependencies:
 ADD clean-layer.sh  /tmp/clean-layer.sh
diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt
@@ -121,19 +121,18 @@ qtconsole
 ray
 rgf-python
 s3fs
+# b/302136621: Fix eli5 import for learntools
 scikit-learn==1.2.2
 # Scikit-learn accelerated library for x86
 scikit-learn-intelex>=2023.0.1
 scikit-multilearn
 scikit-optimize
 scikit-plot
 scikit-surprise
-# b/415358158: Gensim removed from Colab image to upgrade scipy to 1.14.1
-scipy==1.15.1
 # Also pinning seaborn for learntools
 seaborn==0.12.2
 git+https://github.com/facebookresearch/segment-anything.git
-# b/329869023 shap 0.45.0 breaks learntools
+# b/329869023: shap 0.45.0 breaks learntools
 shap==0.44.1
 squarify
 tensorflow-cloud
diff --git a/tests/test_torchtune.py b/tests/test_torchtune.py
@@ -3,8 +3,14 @@
 
 class TestTorchtune(unittest.TestCase):
     def test_help(self):
-        result = subprocess.run(["tune", "--help"], stdout=subprocess.PIPE)
+        result = subprocess.run(
+            ["tune", "--help"], 
+            capture_output=True,
+            text=True
+        )
 
         self.assertEqual(0, result.returncode)
-        self.assertIsNone(result.stderr)
-        self.assertIn("Download a model from the Hugging Face Hub or Kaggle Model Hub.", result.stdout.decode("utf-8"))
+        self.assertIn(
+            "Download a model from the Hugging Face Hub or Kaggle", 
+            result.stdout
+        )