5
5
from keras .src .callbacks .callback import Callback
6
6
from keras .src import backend as K
7
7
8
- # Attempt to import psutil for CPU memory
9
-
10
8
try :
11
9
import psutil
12
10
except ImportError :
15
13
16
14
@keras_export("keras.callbacks.MemoryUsageCallback")
class MemoryUsageCallback(Callback):
    """Monitor CPU/GPU/TPU/OpenVINO memory during training.

    Tracks:
      - CPU memory via `psutil.Process().memory_info().rss`.
      - GPU memory via backend APIs (TF, Torch, JAX, OpenVINO).
      - Logs to stdout and, optionally, to TensorBoard.

    Args:
        monitor_gpu: Bool. If True, query GPU/accelerator memory.
        log_every_batch: Bool. If True, log after each batch.
        tensorboard_log_dir: str or None. If set, use TF summary writer.

    Raises:
        ImportError: If `psutil` is missing.
    """

    def __init__(
        self, monitor_gpu=True, log_every_batch=False, tensorboard_log_dir=None
    ):
        super().__init__()
        # `psutil` is imported at module level inside a try/except; it is
        # None when unavailable, so fail loudly here rather than at first use.
        if psutil is None:
            raise ImportError(
                "MemoryUsageCallback requires the 'psutil' library."
            )
        self.monitor_gpu = monitor_gpu
        self.log_every_batch = log_every_batch
        self.process = psutil.Process()
        self.tb_writer = None
        self._batches_seen = 0  # global batch counter across epochs
        self._warned_backend = False  # ensure unsupported-backend warning fires once

        if tensorboard_log_dir:
            try:
                import tensorflow as tf

                logdir = os.path.expanduser(tensorboard_log_dir)
                self.tb_writer = tf.summary.create_file_writer(logdir)
            except Exception as e:
                # TensorBoard logging is best-effort; training must not fail
                # because TF is absent or the log dir is unwritable.
                warnings.warn(f"TB init error: {e}", RuntimeWarning)

    def on_train_begin(self, logs=None):
        self._batches_seen = 0

    def on_epoch_begin(self, epoch, logs=None):
        cpu = self._cpu_mem_mb()
        gpu = self._get_gpu_memory()
        self._log("Epoch %d start" % epoch, epoch, cpu, gpu)

    def on_epoch_end(self, epoch, logs=None):
        cpu = self._cpu_mem_mb()
        gpu = self._get_gpu_memory()
        # Step epoch+1 so the "end" sample does not overwrite the "start"
        # sample of the same epoch in TensorBoard.
        self._log("Epoch %d end" % epoch, epoch + 1, cpu, gpu)

    def on_batch_end(self, batch, logs=None):
        if self.log_every_batch:
            cpu = self._cpu_mem_mb()
            gpu = self._get_gpu_memory()
            self._log(
                f"Batch {self._batches_seen} end", self._batches_seen, cpu, gpu
            )
        self._batches_seen += 1

    def on_train_end(self, logs=None):
        if self.tb_writer:
            self.tb_writer.close()

    def _cpu_mem_mb(self):
        """Return resident set size of this process, in MB."""
        return self.process.memory_info().rss / (1024**2)

    def _get_gpu_memory(self):
        """Return total accelerator memory in use (MB), or None.

        Returns None when monitoring is disabled, no device is present,
        or the active backend is unsupported.
        """
        if not self.monitor_gpu:
            return None
        backend = K.backend()
        try:
            if backend == "tensorflow":
                import tensorflow as tf

                gpus = tf.config.list_physical_devices("GPU")
                if not gpus:
                    return None
                total = 0
                for g in gpus:
                    # get_memory_info() expects "GPU:0"-style names, while
                    # list_physical_devices() yields "/physical_device:GPU:0";
                    # strip the prefix before querying.
                    dev_name = g.name.replace("/physical_device:", "")
                    info = tf.config.experimental.get_memory_info(dev_name)
                    total += info.get("current", 0)
                return total / (1024**2)
            if backend == "torch":
                import torch

                if not torch.cuda.is_available():
                    return None
                # NOTE(review): this summation was reconstructed from a
                # truncated diff hunk — confirm against the original.
                total = sum(
                    torch.cuda.memory_allocated(i)
                    for i in range(torch.cuda.device_count())
                )
                return total / (1024**2)
            if backend == "jax":
                import jax

                devs = [d for d in jax.devices() if d.platform.upper() == "GPU"]
                if not devs:
                    return None
                total = 0
                for d in devs:
                    # memory_stats() may return None on some platforms.
                    stats = d.memory_stats() or {}
                    total += stats.get("bytes_in_use", 0)
                return total / (1024**2)
            if backend == "openvino":
                try:
                    import openvino as ov

                    core = ov.Core()
                    devices = core.available_devices
                    total = 0
                    for dev in devices:
                        # NOTE(review): "DEVICE_MEMORY_STATISTICS" is not a
                        # documented core property on all OpenVINO devices —
                        # verify; failures fall into the handler below.
                        stats = core.get_property(
                            dev, "DEVICE_MEMORY_STATISTICS"
                        )
                        total += stats.get("deviceUsedBytes", 0)
                    return total / (1024**2)
                except Exception as e:
                    warnings.warn(f"OVINO mem err: {e}", RuntimeWarning)
                    return None
        except ImportError as e:
            warnings.warn(f"Import err for {backend}: {e}", RuntimeWarning)
            return None
        # Warn only once: this method runs every epoch (and possibly every
        # batch), so an unguarded warning would spam the logs.
        if not self._warned_backend:
            warnings.warn(f"Unsupported backend '{backend}'", RuntimeWarning)
            self._warned_backend = True
        return None

    def _log(self, label, step, cpu, gpu):
        """Print one memory sample and mirror it to TensorBoard if enabled."""
        msg = f"{label} - CPU: {cpu:.2f} MB"
        if gpu is not None:
            msg += f"; GPU: {gpu:.2f} MB"
        print(msg)
        if self.tb_writer:
            import tensorflow as tf

            with self.tb_writer.as_default(step=step):
                tf.summary.scalar("Memory/CPU_MB", cpu)
                if gpu is not None:
                    tf.summary.scalar("Memory/GPU_MB", gpu)
            self.tb_writer.flush()
0 commit comments