Commit 19eff36

feat: LoRA hotswapping for server

1 parent ef93670 · commit 19eff36

1 file changed, +47 -3 lines changed

llama_cpp/server/model.py

Lines changed: 47 additions & 3 deletions
@@ -2,7 +2,7 @@
 
 import json
 
-from typing import Dict, Optional, Union, List
+from typing import Any, Dict, Optional, Union, List
 
 import llama_cpp
 import llama_cpp.llama_speculative as llama_speculative
@@ -28,12 +28,19 @@ def __init__(self, models: List[ModelSettings]) -> None:
         self._default_model_alias: str = self._default_model_settings.model_alias  # type: ignore
 
         # Load default model
+
+        if self._default_model_settings.verbose:
+            print(f"Loading default model {self._default_model_alias}")
         self._current_model = self.load_llama_from_model_settings(
             self._default_model_settings
         )
         self._current_model_alias = self._default_model_alias
 
     def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama:
+        """Get the Llama model for the given alias, or the default model otherwise.
+        This may result in model loading, or in hot-swapping if a compatible model
+        is already loaded and only LoRA adapters need to be changed.
+        """
         if model is None:
             model = self._default_model_alias
 
@@ -44,12 +51,49 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama:
             if self._current_model is not None:
                 return self._current_model
 
+        new_settings = self._model_settings_dict[model]
+
+        if self._current_model is not None and self._current_model_alias is not None:
+            current_settings = self._model_settings_dict[self._current_model_alias]
+
+            def hot_swappable_settings(settings: ModelSettings) -> Dict[str, Any]:
+                """Subset of settings used to check if models can be hot-swapped"""
+                values = settings.model_dump()
+                values.pop('model_alias', None)  # The model alias doesn't matter
+                values.pop('lora_adapters', None)  # Different LoRA adapters can be hot-swapped
+                return values
+
+            if hot_swappable_settings(new_settings) == hot_swappable_settings(current_settings):
+                # We can hot-swap! First, zero out existing LoRAs
+                if current_settings.verbose:
+                    print(f"Hot-swapping model, setting existing LoRA adapter scales to 0.0.")
+                if self._current_model.lora_adapters is not None:
+                    for lora_path in self._current_model.lora_adapters:
+                        self._current_model.set_lora_adapter_scale(lora_path, 0.0)
+
+                # Now enable new LoRAs
+                if new_settings.lora_adapters is not None:
+                    if new_settings.verbose:
+                        print(f"Hot-swapping model, setting LoRA adapter scales for {model}.")
+                    for lora_path, scale in new_settings.lora_adapters.items():
+                        self._current_model.set_lora_adapter_scale(
+                            lora_path,
+                            scale,
+                            load_if_needed=True
+                        )
+
+                self._current_model_alias = model
+                return self._current_model
+
         if self._current_model:
+            if current_settings.verbose:
+                print(f"Switching model, unloading current model {self._current_model}")
             self._current_model.close()
             self._current_model = None
 
-        settings = self._model_settings_dict[model]
-        self._current_model = self.load_llama_from_model_settings(settings)
+        if new_settings.verbose:
+            print(f"Switching model, loading new model {model}")
+        self._current_model = self.load_llama_from_model_settings(new_settings)
         self._current_model_alias = model
         return self._current_model
 
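As a rough usage sketch (not part of this commit): the hot-swap path is taken when the requested model differs from the currently loaded one only in its LoRA adapters. The class name LlamaProxy, the file paths, and the lora_adapters mapping (adapter path -> scale) below are assumptions for illustration; the diff only shows that each ModelSettings entry carries such a mapping and that LoRA scales are adjusted via set_lora_adapter_scale.

# Hypothetical example: two aliases share every setting except lora_adapters,
# so switching between them re-scales adapters instead of reloading the model.
from llama_cpp.server.model import LlamaProxy          # proxy class assumed to live in this file
from llama_cpp.server.settings import ModelSettings

# Placeholder paths and settings shared by both aliases
base = dict(model="./models/base-model.Q4_K_M.gguf", n_ctx=4096, verbose=True)

proxy = LlamaProxy(models=[
    ModelSettings(model_alias="sql", lora_adapters={"./loras/sql.gguf": 1.0}, **base),
    ModelSettings(model_alias="chat", lora_adapters={"./loras/chat.gguf": 1.0}, **base),
])

llama = proxy("sql")   # returns the default model loaded with the sql adapter
llama = proxy("chat")  # settings match apart from LoRAs -> hot-swap: old scales set to 0.0, chat adapter enabled

With a configuration like this, a client that changes the model field of its requests between "sql" and "chat" only pays the cost of re-scaling adapters rather than a full model reload.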
