
Commit c3d1252

Author: Lincoln Stein (committed)

revert to old system for doing RAM <-> VRAM transfers; new way leaks memory

1 parent 84f5cbd · commit c3d1252

File tree: 5 files changed, +141 −16 lines


invokeai/app/services/config/config_default.py
Lines changed: 10 additions & 6 deletions

@@ -24,6 +24,7 @@
 DB_FILE = Path("invokeai.db")
 LEGACY_INIT_FILE = Path("invokeai.init")
 DEFAULT_RAM_CACHE = 10.0
+DEFAULT_VRAM_CACHE = 0.25
 DEFAULT_CONVERT_CACHE = 20.0
 DEVICE = Literal["auto", "cpu", "cuda:0", "cuda:1", "cuda:2", "cuda:3", "cuda:4", "cuda:5", "cuda:6", "cuda:7", "mps"]
 PRECISION = Literal["auto", "float16", "bfloat16", "float32", "autocast"]
@@ -99,7 +100,9 @@ class InvokeAIAppConfig(BaseSettings):
         profile_prefix: An optional prefix for profile output files.
         profiles_dir: Path to profiles output directory.
         ram: Maximum memory amount used by memory model cache for rapid switching (GB).
+        vram: Amount of VRAM reserved for model storage (GB).
         convert_cache: Maximum size of on-disk converted models cache (GB).
+        lazy_offload: Keep models in VRAM until their space is needed.
         log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.
         device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.<br>Valid values: `auto`, `cpu`, `cuda:0`, `cuda:1`, `cuda:2`, `cuda:3`, `cuda:4`, `cuda:5`, `cuda:6`, `cuda:7`, `mps`
         devices: List of execution devices; will override default device selected.
@@ -167,7 +170,9 @@ class InvokeAIAppConfig(BaseSettings):

     # CACHE
     ram: float = Field(default_factory=get_default_ram_cache_size, gt=0, description="Maximum memory amount used by memory model cache for rapid switching (GB).")
+    vram: float = Field(default=DEFAULT_VRAM_CACHE, ge=0, description="Amount of VRAM reserved for model storage (GB).")
     convert_cache: float = Field(default=DEFAULT_CONVERT_CACHE, ge=0, description="Maximum size of on-disk converted models cache (GB).")
+    lazy_offload: bool = Field(default=True, description="Keep models in VRAM until their space is needed.")
     log_memory_usage: bool = Field(default=False, description="If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.")

     # DEVICE
@@ -366,9 +371,6 @@ def migrate_v3_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig:
             # `max_cache_size` was renamed to `ram` some time in v3, but both names were used
             if k == "max_cache_size" and "ram" not in category_dict:
                 parsed_config_dict["ram"] = v
-            # vram was removed in v4.0.2
-            if k in ["vram", "max_vram_cache_size", "lazy_offload"]:
-                continue
             # autocast was removed in v4.0.1
             if k == "precision" and v == "autocast":
                 parsed_config_dict["precision"] = "auto"
@@ -419,22 +421,24 @@ def migrate_v4_0_0_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig
 def migrate_v4_0_1_config_dict(config_dict: dict[str, Any]) -> InvokeAIAppConfig:
     """Migrate v4.0.1 config dictionary to a current config object.

+    A few new multi-GPU options were added in 4.0.2, and this simply
+    updates the schema label.
+
     Args:
         config_dict: A dictionary of settings from a v4.0.1 config file.

     Returns:
         An instance of `InvokeAIAppConfig` with the migrated settings.
     """
     parsed_config_dict: dict[str, Any] = {}
-    for k, v in config_dict.items():
-        if k not in ["vram", "lazy_offload"]:
-            parsed_config_dict[k] = v
+    for k, _ in config_dict.items():
         if k == "schema_version":
             parsed_config_dict[k] = CONFIG_SCHEMA_VERSION
     config = DefaultInvokeAIAppConfig.model_validate(parsed_config_dict)
     return config


+# TO DO: replace this with a formal registration and migration system
 def load_and_migrate_config(config_path: Path) -> InvokeAIAppConfig:
     """Load and migrate a config file to the latest version.
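
The restored `ram`, `vram`, and `lazy_offload` fields are ordinary pydantic settings, so their values can be inspected once a config has been loaded. Below is a minimal sketch, assuming the `get_config()` accessor exported by `invokeai.app.services.config`; the commented defaults simply mirror the Field definitions in the hunk above.

# Minimal sketch (not part of this commit): reading the restored cache
# settings from the loaded application config. get_config() is assumed to
# return the process-wide InvokeAIAppConfig instance.
from invokeai.app.services.config import get_config

config = get_config()
print(config.ram)           # RAM model cache size in GB (default 10.0)
print(config.vram)          # VRAM reserved for model storage in GB (default 0.25)
print(config.lazy_offload)  # keep models in VRAM until space is needed (default True)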

invokeai/app/services/model_manager/model_manager_default.py
Lines changed: 2 additions & 0 deletions

@@ -76,6 +76,8 @@ def build_model_manager(

         ram_cache = ModelCache(
             max_cache_size=app_config.ram,
+            max_vram_cache_size=app_config.vram,
+            lazy_offloading=app_config.lazy_offload,
             logger=logger,
         )
         convert_cache = ModelConvertCache(cache_path=app_config.convert_cache_path, max_size=app_config.convert_cache)

invokeai/backend/model_manager/load/model_cache/model_cache_base.py
Lines changed: 16 additions & 0 deletions

@@ -113,12 +113,28 @@ def get_execution_device(self) -> torch.device:
         """
         pass

+    @property
+    @abstractmethod
+    def lazy_offloading(self) -> bool:
+        """Return true if the cache is configured to lazily offload models in VRAM."""
+        pass
+
     @property
     @abstractmethod
     def max_cache_size(self) -> float:
         """Return true if the cache is configured to lazily offload models in VRAM."""
         pass

+    @abstractmethod
+    def offload_unlocked_models(self, size_required: int) -> None:
+        """Offload from VRAM any models not actively in use."""
+        pass
+
+    @abstractmethod
+    def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: torch.device) -> None:
+        """Move model into the indicated device."""
+        pass
+
     @property
     @abstractmethod
     def stats(self) -> Optional[CacheStats]:

invokeai/backend/model_manager/load/model_cache/model_cache_default.py
Lines changed: 100 additions & 1 deletion

@@ -19,8 +19,10 @@
 """

 import gc
+import math
 import sys
 import threading
+import time
 from contextlib import contextmanager, suppress
 from logging import Logger
 from threading import BoundedSemaphore
@@ -29,7 +31,7 @@
 import torch

 from invokeai.backend.model_manager import AnyModel, SubModelType
-from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot
+from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot, get_pretty_snapshot_diff
 from invokeai.backend.util.devices import TorchDevice
 from invokeai.backend.util.logging import InvokeAILogger

@@ -40,6 +42,11 @@
 # Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously
 DEFAULT_MAX_CACHE_SIZE = 6.0

+# amount of GPU memory to hold in reserve for use by generations (GB)
+# Empirically this value seems to improve performance without starving other
+# processes.
+DEFAULT_MAX_VRAM_CACHE_SIZE = 0.25
+
 # actual size of a gig
 GIG = 1073741824

@@ -53,10 +60,12 @@ class ModelCache(ModelCacheBase[AnyModel]):
     def __init__(
         self,
         max_cache_size: float = DEFAULT_MAX_CACHE_SIZE,
+        max_vram_cache_size: float = DEFAULT_MAX_VRAM_CACHE_SIZE,
         storage_device: torch.device = torch.device("cpu"),
         execution_devices: Optional[Set[torch.device]] = None,
         precision: torch.dtype = torch.float16,
         sequential_offload: bool = False,
+        lazy_offloading: bool = True,
         sha_chunksize: int = 16777216,
         log_memory_usage: bool = False,
         logger: Optional[Logger] = None,
@@ -67,14 +76,18 @@ def __init__(
         :param max_cache_size: Maximum size of the RAM cache [6.0 GB]
         :param storage_device: Torch device to save inactive model in [torch.device('cpu')]
         :param precision: Precision for loaded models [torch.float16]
+        :param lazy_offloading: Keep model in VRAM until another model needs to be loaded
         :param sequential_offload: Conserve VRAM by loading and unloading each stage of the pipeline sequentially
         :param log_memory_usage: If True, a memory snapshot will be captured before and after every model cache
            operation, and the result will be logged (at debug level). There is a time cost to capturing the memory
            snapshots, so it is recommended to disable this feature unless you are actively inspecting the model cache's
            behaviour.
         """
+        # allow lazy offloading only when vram cache enabled
+        self._lazy_offloading = lazy_offloading and max_vram_cache_size > 0
         self._precision: torch.dtype = precision
         self._max_cache_size: float = max_cache_size
+        self._max_vram_cache_size: float = max_vram_cache_size
         self._storage_device: torch.device = storage_device
         self._ram_lock = threading.Lock()
         self._logger = logger or InvokeAILogger.get_logger(self.__class__.__name__)
@@ -98,6 +111,11 @@ def logger(self) -> Logger:
         """Return the logger used by the cache."""
         return self._logger

+    @property
+    def lazy_offloading(self) -> bool:
+        """Return true if the cache is configured to lazily offload models in VRAM."""
+        return self._lazy_offloading
+
     @property
     def storage_device(self) -> torch.device:
         """Return the storage device (e.g. "CPU" for RAM)."""
@@ -277,6 +295,87 @@ def _make_cache_key(self, model_key: str, submodel_type: Optional[SubModelType]
         else:
             return model_key

+    def offload_unlocked_models(self, size_required: int) -> None:
+        """Move any unused models from VRAM."""
+        reserved = self._max_vram_cache_size * GIG
+        vram_in_use = torch.cuda.memory_allocated() + size_required
+        self.logger.debug(f"{(vram_in_use/GIG):.2f}GB VRAM needed for models; max allowed={(reserved/GIG):.2f}GB")
+        for _, cache_entry in sorted(self._cached_models.items(), key=lambda x: x[1].size):
+            if vram_in_use <= reserved:
+                break
+            if not cache_entry.loaded:
+                continue
+            if not cache_entry.locked:
+                self.move_model_to_device(cache_entry, self.storage_device)
+                cache_entry.loaded = False
+                vram_in_use = torch.cuda.memory_allocated() + size_required
+                self.logger.debug(
+                    f"Removing {cache_entry.key} from VRAM to free {(cache_entry.size/GIG):.2f}GB; vram free = {(torch.cuda.memory_allocated()/GIG):.2f}GB"
+                )
+
+        TorchDevice.empty_cache()
+
+    def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: torch.device) -> None:
+        """Move model into the indicated device.
+
+        :param cache_entry: The CacheRecord for the model
+        :param target_device: The torch.device to move the model into
+
+        May raise a torch.cuda.OutOfMemoryError
+        """
+        # These attributes are not in the base ModelMixin class but in various derived classes.
+        # Some models don't have these attributes, in which case they run in RAM/CPU.
+        self.logger.debug(f"Called to move {cache_entry.key} to {target_device}")
+        if not (hasattr(cache_entry.model, "device") and hasattr(cache_entry.model, "to")):
+            return
+
+        source_device = cache_entry.model.device
+
+        # Note: We compare device types only so that 'cuda' == 'cuda:0'.
+        # This would need to be revised to support multi-GPU.
+        if torch.device(source_device).type == torch.device(target_device).type:
+            return
+
+        start_model_to_time = time.time()
+        snapshot_before = self._capture_memory_snapshot()
+        try:
+            cache_entry.model.to(target_device)
+        except Exception as e:  # blow away cache entry
+            self._delete_cache_entry(cache_entry)
+            raise e
+
+        snapshot_after = self._capture_memory_snapshot()
+        end_model_to_time = time.time()
+        self.logger.debug(
+            f"Moved model '{cache_entry.key}' from {source_device} to"
+            f" {target_device} in {(end_model_to_time-start_model_to_time):.2f}s."
+            f"Estimated model size: {(cache_entry.size/GIG):.3f} GB."
+            f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}"
+        )
+
+        if (
+            snapshot_before is not None
+            and snapshot_after is not None
+            and snapshot_before.vram is not None
+            and snapshot_after.vram is not None
+        ):
+            vram_change = abs(snapshot_before.vram - snapshot_after.vram)
+
+            # If the estimated model size does not match the change in VRAM, log a warning.
+            if not math.isclose(
+                vram_change,
+                cache_entry.size,
+                rel_tol=0.1,
+                abs_tol=10 * MB,
+            ):
+                self.logger.debug(
+                    f"Moving model '{cache_entry.key}' from {source_device} to"
+                    f" {target_device} caused an unexpected change in VRAM usage. The model's"
+                    " estimated size may be incorrect. Estimated model size:"
+                    f" {(cache_entry.size/GIG):.3f} GB.\n"
+                    f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}"
+                )
+
     def print_cuda_stats(self) -> None:
         """Log CUDA diagnostics."""
         vram = "%4.2fG" % (torch.cuda.memory_allocated() / GIG)
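
To make the eviction rule in offload_unlocked_models concrete: max_vram_cache_size acts as a VRAM budget, and a lock request offloads unlocked models, smallest first, until the currently allocated VRAM plus the incoming model fits under that budget. The arithmetic, with invented example numbers:

# Illustration only (not part of the commit): the budget check performed by
# offload_unlocked_models, with made-up sizes.
GIG = 1073741824

reserved = 0.25 * GIG              # the `vram` setting: VRAM budget for cached models
size_required = int(2.0 * GIG)     # size of the model about to be locked into VRAM
already_allocated = 3 * GIG        # stand-in for torch.cuda.memory_allocated()

vram_in_use = already_allocated + size_required
# While vram_in_use > reserved, unlocked loaded models are moved back to the
# storage device (CPU), smallest first; with the default 0.25 GB budget this
# effectively clears every unlocked model before the new one is moved in.
print(vram_in_use > reserved)      # True, so offloading proceeds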

invokeai/backend/model_manager/load/model_cache/model_locker.py
Lines changed: 13 additions & 9 deletions

@@ -2,7 +2,6 @@
 Base class and implementation of a class that moves models in and out of VRAM.
 """

-import copy
 from typing import Optional

 import torch
@@ -55,13 +54,14 @@ def lock(self) -> AnyModel:
         # NOTE that the model has to have the to() method in order for this code to move it into GPU!
         self._cache_entry.lock()
         try:
-            # We wait for a gpu to be free - may raise a ValueError
-            self._execution_device = self._cache.get_execution_device()
-            self._cache.logger.debug(f"Locking {self._cache_entry.key} in {self._execution_device}")
-            model_in_gpu = copy.deepcopy(self._cache_entry.model)
-            if hasattr(model_in_gpu, "to"):
-                model_in_gpu.to(self._execution_device)
+            if self._cache.lazy_offloading:
+                self._cache.offload_unlocked_models(self._cache_entry.size)
+
+            execution_device = self._cache.get_execution_device()
+            self._cache.move_model_to_device(self._cache_entry, execution_device)
             self._cache_entry.loaded = True
+
+            self._cache.logger.debug(f"Locking {self._cache_entry.key} in {execution_device}")
             self._cache.print_cuda_stats()
         except torch.cuda.OutOfMemoryError:
             self._cache.logger.warning("Insufficient GPU memory to load model. Aborting")
@@ -70,11 +70,15 @@ def lock(self) -> AnyModel:
         except Exception:
             self._cache_entry.unlock()
             raise
-        return model_in_gpu
+
+        return self.model

     def unlock(self) -> None:
         """Call upon exit from context."""
         if not hasattr(self.model, "to"):
             return
+
         self._cache_entry.unlock()
-        self._cache.print_cuda_stats()
+        if not self._cache.lazy_offloading:
+            self._cache.offload_unlocked_models(self._cache_entry.size)
+        self._cache.print_cuda_stats()
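
The lock()/unlock() pair is normally driven from a context manager further up the stack rather than called directly; the hypothetical sketch below only shows how the lazy_offloading flag decides where offloading happens (at lock time when lazy, at unlock time otherwise). The constructor arguments are assumptions, not the documented API.

# Hypothetical usage sketch: balancing lock() and unlock() around inference.
# `ram_cache` and `cache_entry` are assumed to exist; real callers get a
# locker from the model manager's load path instead of constructing one.
locker = ModelLocker(cache=ram_cache, cache_entry=cache_entry)
try:
    model = locker.lock()   # with lazy_offloading, unlocked models are offloaded here
    # ... run inference with `model` on the execution device ...
finally:
    locker.unlock()         # without lazy_offloading, offloading happens here instead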
