@@ -2,7 +2,7 @@
 import json
 
-from typing import Dict, Optional, Union, List
+from typing import Any, Dict, Optional, Union, List
 
 import llama_cpp
 import llama_cpp.llama_speculative as llama_speculative
@@ -28,12 +28,19 @@ def __init__(self, models: List[ModelSettings]) -> None:
         self._default_model_alias: str = self._default_model_settings.model_alias  # type: ignore
 
         # Load default model
+
+        if self._default_model_settings.verbose:
+            print(f"Loading default model {self._default_model_alias}")
         self._current_model = self.load_llama_from_model_settings(
             self._default_model_settings
         )
         self._current_model_alias = self._default_model_alias
 
     def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama:
+        """Get the Llama model for the given alias, or the default model otherwise.
+        This may result in loading the model, or in hot-swapping if a compatible
+        model is already loaded and only its LoRA adapters need to be changed.
+        """
         if model is None:
             model = self._default_model_alias
 
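The compatibility check in the next hunk reduces to comparing the pydantic `model_dump()` output of two `ModelSettings`, with the alias and the LoRA adapters popped out first. A minimal sketch of that idea, with `hot_swappable_settings` restated for illustration (the model path, aliases, and adapter file are hypothetical):

```python
from typing import Any, Dict

from llama_cpp.server.settings import ModelSettings

base = ModelSettings(model="models/base.gguf", model_alias="base")
sql = ModelSettings(
    model="models/base.gguf",  # same weights and options as "base"
    model_alias="base-sql",
    lora_adapters={"loras/sql.gguf": 1.0},  # hypothetical adapter file
)

def hot_swappable_settings(settings: ModelSettings) -> Dict[str, Any]:
    values = settings.model_dump()
    values.pop("model_alias", None)    # aliases never block a hot-swap
    values.pop("lora_adapters", None)  # adapters are exactly what hot-swapping changes
    return values

# Equal once the alias and adapters are ignored, so switching between these
# two aliases only rescales LoRA adapters instead of reloading the weights.
assert hot_swappable_settings(base) == hot_swappable_settings(sql)
```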
@@ -44,12 +51,49 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama:
             if self._current_model is not None:
                 return self._current_model
 
+        new_settings = self._model_settings_dict[model]
+
+        if self._current_model is not None and self._current_model_alias is not None:
+            current_settings = self._model_settings_dict[self._current_model_alias]
+
+            def hot_swappable_settings(settings: ModelSettings) -> Dict[str, Any]:
+                """Subset of settings used to check if models can be hot-swapped"""
+                values = settings.model_dump()
+                values.pop("model_alias", None)  # The model alias doesn't matter
+                values.pop("lora_adapters", None)  # Different LoRA adapters can be hot-swapped
+                return values
+
+            if hot_swappable_settings(new_settings) == hot_swappable_settings(current_settings):
+                # We can hot-swap! First, zero out the existing LoRA adapter scales
+                if current_settings.verbose:
+                    print("Hot-swapping model, setting existing LoRA adapter scales to 0.0.")
+                if self._current_model.lora_adapters is not None:
+                    for lora_path in self._current_model.lora_adapters:
+                        self._current_model.set_lora_adapter_scale(lora_path, 0.0)
+
+                # Now enable the new LoRA adapters
+                if new_settings.lora_adapters is not None:
+                    if new_settings.verbose:
+                        print(f"Hot-swapping model, setting LoRA adapter scales for {model}.")
+                    for lora_path, scale in new_settings.lora_adapters.items():
+                        self._current_model.set_lora_adapter_scale(
+                            lora_path,
+                            scale,
+                            load_if_needed=True,
+                        )
+
+                self._current_model_alias = model
+                return self._current_model
+
         if self._current_model:
+            if current_settings.verbose:
+                print(f"Switching model, unloading current model {self._current_model_alias}")
             self._current_model.close()
             self._current_model = None
 
-        settings = self._model_settings_dict[model]
-        self._current_model = self.load_llama_from_model_settings(settings)
+        if new_settings.verbose:
+            print(f"Switching model, loading new model {model}")
+        self._current_model = self.load_llama_from_model_settings(new_settings)
         self._current_model_alias = model
         return self._current_model
 
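End to end, requesting a different alias through the proxy now either hot-swaps LoRA adapter scales or closes and reloads the model. A rough usage sketch under the same hypothetical aliases as above (real GGUF files are needed for it to actually run):

```python
from llama_cpp.server.model import LlamaProxy
from llama_cpp.server.settings import ModelSettings

base = ModelSettings(model="models/base.gguf", model_alias="base")
sql = ModelSettings(
    model="models/base.gguf",
    model_alias="base-sql",
    lora_adapters={"loras/sql.gguf": 1.0},
)

proxy = LlamaProxy(models=[base, sql])  # __init__ eagerly loads the default model ("base")

llama = proxy("base-sql")  # hot-swap: same weights, loras/sql.gguf enabled at scale 1.0
llama = proxy("base")      # hot-swap back: all adapter scales zeroed, no reload
llama = proxy()            # no alias given: returns the default model
```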