Skip to content

Commit 105cbdc

Browse files
Add openvino support
1 parent: daddf29 · commit: 105cbdc

File tree

2 files changed

+161 / -162 lines changed

2 files changed

+161 / -162 lines changed

keras/src/callbacks/memory_usage_callback.py

Lines changed: 70 additions & 84 deletions
Original file line number | Diff line number | Diff line change
@@ -5,8 +5,6 @@
55
from keras.src.callbacks.callback import Callback
66
from keras.src import backend as K
77

8-
# Attempt to import psutil for CPU memory
9-
108
try:
119
import psutil
1210
except ImportError:
@@ -15,83 +13,88 @@
1513

1614
@keras_export("keras.callbacks.MemoryUsageCallback")
1715
class MemoryUsageCallback(Callback):
18-
"""
19-
Monitors CPU and GPU memory across backends and logs to stdout and TensorBoard.
20-
21-
Example:
22-
```python
23-
from keras.callbacks import MemoryUsageCallback
24-
callback = MemoryUsageCallback(
25-
monitor_gpu=True,
26-
log_every_batch=False,
27-
tensorboard_log_dir="./logs"
28-
)
29-
model.fit(..., callbacks=[callback])
30-
```
16+
"""Monitor CPU/GPU/TPU/OpenVINO memory during training.
17+
18+
Tracks:
19+
- CPU memory via `psutil.Process().memory_info().rss`.
20+
- GPU memory via backend APIs (TF, Torch, JAX, OpenVINO).
21+
- Logs to stdout and, optionally, to TensorBoard.
3122
3223
Args:
33-
monitor_gpu (bool): Whether to log GPU memory. Defaults to True.
34-
log_every_batch (bool): Whether to log after every batch. Defaults to False.
35-
tensorboard_log_dir (str): Directory for TensorBoard logs; None disables. Defaults to None.
24+
monitor_gpu: Bool. If True, query GPU/accelerator memory.
25+
log_every_batch: Bool. If True, log after each batch.
26+
tensorboard_log_dir: str or None. If set, use TF summary writer.
3627
3728
Raises:
38-
ImportError: If psutil is not installed.
29+
ImportError: If `psutil` is missing.
3930
"""
4031

4132
def __init__(
42-
self,
43-
monitor_gpu=True,
44-
log_every_batch=False,
45-
tensorboard_log_dir=None,
33+
self, monitor_gpu=True, log_every_batch=False, tensorboard_log_dir=None
4634
):
4735
super().__init__()
4836
if psutil is None:
49-
raise ImportError(
50-
"MemoryUsageCallback requires `psutil`; install via `pip install psutil`."
51-
)
37+
raise ImportError("MemoryUsageCallback requires the 'psutil' library.")
5238
self.monitor_gpu = monitor_gpu
5339
self.log_every_batch = log_every_batch
5440
self.process = psutil.Process()
5541
self.tb_writer = None
56-
self._batch_count = 0
42+
self._batches_seen = 0
5743

5844
if tensorboard_log_dir:
5945
try:
60-
import tensorflow as tf
46+
import tensorflow as tf
6147

6248
logdir = os.path.expanduser(tensorboard_log_dir)
6349
self.tb_writer = tf.summary.create_file_writer(logdir)
64-
except ImportError as e:
65-
warnings.warn(f"TensorBoard disabled (no TF): {e}", RuntimeWarning)
6650
except Exception as e:
67-
warnings.warn(
68-
f"Failed to init TB writer at {tensorboard_log_dir}: {e}",
69-
RuntimeWarning,
70-
)
51+
warnings.warn(f"TB init error: {e}", RuntimeWarning)
52+
53+
def on_train_begin(self, logs=None):
54+
self._batches_seen = 0
55+
56+
def on_epoch_begin(self, epoch, logs=None):
57+
cpu = self._cpu_mem_mb()
58+
gpu = self._get_gpu_memory()
59+
self._log("Epoch %d start" % epoch, epoch, cpu, gpu)
60+
61+
def on_epoch_end(self, epoch, logs=None):
62+
cpu = self._cpu_mem_mb()
63+
gpu = self._get_gpu_memory()
64+
self._log("Epoch %d end" % epoch, epoch + 1, cpu, gpu)
65+
66+
def on_batch_end(self, batch, logs=None):
67+
if self.log_every_batch:
68+
cpu = self._cpu_mem_mb()
69+
gpu = self._get_gpu_memory()
70+
self._log(f"Batch {self._batches_seen} end", self._batches_seen, cpu, gpu)
71+
self._batches_seen += 1
72+
73+
def on_train_end(self, logs=None):
74+
if self.tb_writer:
75+
self.tb_writer.close()
7176

72-
def _get_cpu_memory(self):
73-
"""Return resident set size in MB."""
77+
def _cpu_mem_mb(self):
7478
return self.process.memory_info().rss / (1024**2)
7579

7680
def _get_gpu_memory(self):
77-
"""Return GPU memory usage in MB or None."""
7881
if not self.monitor_gpu:
7982
return None
8083
backend = K.backend()
8184
try:
8285
if backend == "tensorflow":
83-
import tensorflow as tf
86+
import tensorflow as tf
8487

8588
gpus = tf.config.list_physical_devices("GPU")
8689
if not gpus:
8790
return None
8891
total = 0
89-
for gpu in gpus:
90-
info = tf.config.experimental.get_memory_info(gpu.name)
92+
for g in gpus:
93+
info = tf.config.experimental.get_memory_info(g.name)
9194
total += info.get("current", 0)
9295
return total / (1024**2)
9396
if backend == "torch":
94-
import torch
97+
import torch
9598

9699
if not torch.cuda.is_available():
97100
return None
@@ -101,63 +104,46 @@ def _get_gpu_memory(self):
101104
)
102105
return total / (1024**2)
103106
if backend == "jax":
104-
import jax
107+
import jax
105108

106-
devs = [d for d in jax.devices() if d.platform == "gpu"]
109+
devs = [d for d in jax.devices() if d.platform.upper() == "GPU"]
107110
if not devs:
108111
return None
109112
total = 0
110113
for d in devs:
111-
stats = getattr(d, "memory_stats", lambda: {})()
112-
total += stats.get("bytes_in_use", stats.get("allocated_bytes", 0))
114+
stats = d.memory_stats()
115+
total += stats.get("bytes_in_use", 0)
113116
return total / (1024**2)
114-
if not hasattr(self, "_warned_backend"):
115-
warnings.warn(
116-
f"Backend '{backend}' not supported for GPU memory.",
117-
RuntimeWarning,
118-
)
119-
self._warned_backend = True
120-
return None
117+
if backend == "openvino":
118+
try:
119+
import openvino as ov
120+
121+
core = ov.Core()
122+
devices = core.available_devices
123+
total = 0
124+
for dev in devices:
125+
stats = core.get_property(dev, "DEVICE_MEMORY_STATISTICS")
126+
total += stats.get("deviceUsedBytes", 0)
127+
return total / (1024**2)
128+
except Exception as e:
129+
warnings.warn(f"OVINO mem err: {e}", RuntimeWarning)
130+
return None
121131
except ImportError as e:
122-
warnings.warn(
123-
f"Could not import backend lib ({e}); GPU disabled.",
124-
RuntimeWarning,
125-
)
126-
return None
127-
except Exception as e:
128-
warnings.warn(f"Error retrieving GPU memory ({e}).", RuntimeWarning)
132+
warnings.warn(f"Import err for {backend}: {e}", RuntimeWarning)
129133
return None
134+
warnings.warn(f"Unsupported backend '{backend}'", RuntimeWarning)
135+
return None
130136

131-
def _log(self, label, step):
132-
cpu = self._get_cpu_memory()
133-
gpu = self._get_gpu_memory()
134-
msg = f"{label} - CPU Memory: {cpu:.2f} MB"
137+
def _log(self, label, step, cpu, gpu):
138+
msg = f"{label} - CPU: {cpu:.2f} MB"
135139
if gpu is not None:
136-
msg += f"; GPU Memory: {gpu:.2f} MB"
140+
msg += f"; GPU: {gpu:.2f} MB"
137141
print(msg)
138142
if self.tb_writer:
139-
import tensorflow as tf
143+
import tensorflow as tf
140144

141-
with self.tb_writer.as_default(step=int(step)):
145+
with self.tb_writer.as_default(step=step):
142146
tf.summary.scalar("Memory/CPU_MB", cpu)
143147
if gpu is not None:
144148
tf.summary.scalar("Memory/GPU_MB", gpu)
145149
self.tb_writer.flush()
146-
147-
def on_train_begin(self, logs=None):
148-
self._batch_count = 0
149-
150-
def on_epoch_begin(self, epoch, logs=None):
151-
self._log(f"Epoch {epoch} start", epoch)
152-
153-
def on_epoch_end(self, epoch, logs=None):
154-
self._log(f"Epoch {epoch} end", epoch + 1)
155-
156-
def on_batch_end(self, batch, logs=None):
157-
if self.log_every_batch:
158-
self._log(f"Batch {self._batch_count} end", self._batch_count)
159-
self._batch_count += 1
160-
161-
def on_train_end(self, logs=None):
162-
if self.tb_writer:
163-
self.tb_writer.close()

0 commit comments

Comments
 (0)