mlx - remat and continued test updates (#21244)

acsweet · web-flow · commit 93d52b05ff51 · 2025-05-03T22:08:18.000-07:00
* all tests excl remaining linalg ops passing on Apple silicon

* comment message
diff --git a/keras/src/backend/mlx/core.py b/keras/src/backend/mlx/core.py
@@ -21,7 +21,8 @@
 
 SUPPORTS_SPARSE_TENSORS = False
 SUPPORTS_RAGGED_TENSORS = False
-IS_THREAD_SAFE = True
+# TODO: track mlx releases and switch to True once mlx is thread safe
+IS_THREAD_SAFE = False  # False as of mlx 0.24.0
 
 MLX_DTYPES = {
     "float16": mx.float16,
@@ -596,6 +597,18 @@ def __call__(self, *args, **kwargs):
         return outputs
 
 
+def remat(f):
+    """Rematerialize `f` by wrapping it with `mx.checkpoint`.
+
+    Args:
+        f: The function or operation to rematerialize.
+    Returns:
+        A function wrapping `f` that defines a custom gradient, which
+        recomputes `f` on the backward pass of a gradient call.
+    """
+    return mx.checkpoint(f)
+
+
 def enable_float64():
     """Returns context manager forcing operations on cpu
 
diff --git a/keras/src/dtype_policies/dtype_policy_map_test.py b/keras/src/dtype_policies/dtype_policy_map_test.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest
 
+from keras.src import backend
 from keras.src import dtype_policies
 from keras.src import layers
 from keras.src import models
@@ -23,6 +24,9 @@ def tearDown(self):
 
     @pytest.mark.requires_trainable_backend
     def test_basic_usage(self):
+        if backend.backend() == "mlx":
+            self.skipTest("mlx backend does not yet support quantization")
+
         # Create a subclass that might contain mixing dtype policies for
         # sublayers.
         # It is important to ensure that `dtype` is passed to sublayers and
diff --git a/keras/src/initializers/constant_initializers_test.py b/keras/src/initializers/constant_initializers_test.py
@@ -77,6 +77,11 @@ def test_identity_initializer(self):
 
     @skip_if_backend("openvino", "openvino backend does not support `arange`")
     def test_stft_initializer(self):
+        if backend.backend() == "mlx":
+            # for the mlx backend, force ops onto the cpu to enable float64
+            self.mlx_cpu_context = backend.core.enable_float64()
+            self.mlx_cpu_context.__enter__()
+
         shape = (256, 1, 513)
         time_range = np.arange(256).reshape((-1, 1, 1))
         freq_range = (np.arange(513) / 1024.0).reshape((1, 1, -1))
@@ -142,3 +147,6 @@ def test_stft_initializer(self):
         # Test compatible class_name
         initializer = initializers.get("STFTInitializer")
         self.assertIsInstance(initializer, initializers.STFT)
+
+        if backend.backend() == "mlx":
+            self.mlx_cpu_context.__exit__(None, None, None)
diff --git a/keras/src/layers/attention/grouped_query_attention_test.py b/keras/src/layers/attention/grouped_query_attention_test.py
@@ -287,8 +287,9 @@ def test_masking(self, use_causal_mask):
             mask = mask & np.array(
                 [[[1, 0, 0], [1, 1, 0]] + [[1, 1, 1]] * 3]
             ).astype(bool)
-        del masked_query._keras_mask
-        del masked_value._keras_mask
+        if backend.backend() != "mlx":
+            del masked_query._keras_mask
+            del masked_value._keras_mask
         output_with_manual_mask = layer(
             query=masked_query, value=masked_value, attention_mask=mask
         )
diff --git a/keras/src/layers/attention/multi_head_attention_test.py b/keras/src/layers/attention/multi_head_attention_test.py
@@ -356,8 +356,9 @@ def test_masking(self, use_causal_mask):
         )
         if use_causal_mask:
             mask = mask & np.array([[[1, 0, 0], [1, 1, 0]] + [[1, 1, 1]] * 3])
-        del masked_query._keras_mask
-        del masked_value._keras_mask
+        if backend.backend() != "mlx":
+            del masked_query._keras_mask
+            del masked_value._keras_mask
         output_with_manual_mask = layer(
             query=masked_query, value=masked_value, attention_mask=mask
         )
diff --git a/keras/src/layers/layer_test.py b/keras/src/layers/layer_test.py
@@ -199,6 +199,8 @@ def call(self, x):
 
     def test_quantized_layer_with_remat(self):
         """Test rematerialization on a quantized layer."""
+        if backend.backend() == "mlx":
+            self.skipTest("float8 is not yet supported in mlx backend.")
         with patch(
             "keras.src.backend.common.remat.remat", wraps=remat.remat
         ) as mock_remat:
diff --git a/keras/src/models/model_test.py b/keras/src/models/model_test.py
@@ -1238,8 +1238,8 @@ def test_export_error(self):
             with self.assertRaisesRegex(
                 NotImplementedError,
                 (
-                    r"`export_saved_model` only currently supports the "
-                    r"tensorflow, jax and torch backends."
+                    r"`ExportArchive` is only compatible with TensorFlow, "
+                    r"JAX and Torch backends."
                 ),
             ):
                 model.export(temp_filepath, format="tf_saved_model")
diff --git a/keras/src/ops/linalg_test.py b/keras/src/ops/linalg_test.py
@@ -198,6 +198,11 @@ def test_cholesky(self):
         out = linalg.cholesky(x)
         self.assertEqual(out.shape, (4, 3, 3))
 
+        if backend.backend() == "mlx":
+            # mlx backend currently cannot mimic numpy ValueError
+            # for bad Cholesky decomp, e.g. if matrix is not pos-definite
+            return
+
         x = KerasTensor([10, 20, 15])
         with self.assertRaises(ValueError):
             linalg.cholesky(x)
@@ -340,8 +345,11 @@ def test_svd(self):
 class LinalgOpsCorrectnessTest(testing.TestCase):
     def test_cholesky(self):
         x = np.random.rand(4, 3, 3).astype("float32")
-        with self.assertRaises(ValueError):
-            linalg.cholesky(x)
+        if backend.backend() != "mlx":
+            # mlx backend currently cannot mimic numpy ValueError
+            # for bad Cholesky decomp, e.g. if matrix is not pos-definite
+            with self.assertRaises(ValueError):
+                linalg.cholesky(x)
         x_psd = x @ x.transpose((0, 2, 1)) + 1e-5 * np.eye(3)
         out = linalg.cholesky(x_psd)
         self.assertAllClose(out, np.linalg.cholesky(x_psd), atol=1e-4)
diff --git a/keras/src/ops/nn_test.py b/keras/src/ops/nn_test.py
diff --git a/keras/src/quantizers/quantizers_test.py b/keras/src/quantizers/quantizers_test.py

Original file line number	Diff line number	Diff line change
`@@ -356,8 +356,9 @@ def test_masking(self, use_causal_mask):`
`356`	`356`	`         )`
`357`	`357`	`         if use_causal_mask:`
`358`	`358`	`             mask = mask & np.array([[[1, 0, 0], [1, 1, 0]] + [[1, 1, 1]] * 3])`
`359`		`-        del masked_query._keras_mask`
`360`		`-        del masked_value._keras_mask`
	`359`	`+        if backend.backend() != "mlx":`
	`360`	`+            del masked_query._keras_mask`
	`361`	`+            del masked_value._keras_mask`
`361`	`362`	`         output_with_manual_mask = layer(`
`362`	`363`	`             query=masked_query, value=masked_value, attention_mask=mask`
`363`	`364`	`         )`