
Commit a3766dd
feat: Multi-LoRA changes to match Llama wrapper for server
1 parent 2877e6e

2 files changed (+3, -8 lines)

llama_cpp/server/model.py

Lines changed: 1 addition & 2 deletions
@@ -268,8 +268,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         # Sampling Params
         last_n_tokens_size=settings.last_n_tokens_size,
         # LoRA Params
-        lora_base=settings.lora_base,
-        lora_path=settings.lora_path,
+        lora_adapters=settings.lora_adapters,
         # Backend Params
         numa=settings.numa,
         # Chat Format Params
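
For context, a minimal sketch of what this change enables on the wrapper side: the server now forwards a single lora_adapters mapping of adapter path to scale, replacing the old lora_base/lora_path pair. The model and adapter paths below are hypothetical placeholders; only the lora_adapters keyword comes from this commit.

import llama_cpp

llm = llama_cpp.Llama(
    model_path="models/base.gguf",   # hypothetical base model path
    lora_adapters={
        "loras/style.gguf": 1.0,     # applied at full scale
        "loras/domain.gguf": 0.5,    # applied at half scale
        "loras/inactive.gguf": 0.0,  # scale 0.0: not used during inference
    },
)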

llama_cpp/server/settings.py

Lines changed: 2 additions & 6 deletions
@@ -113,13 +113,9 @@ class ModelSettings(BaseSettings):
         description="Last n tokens to keep for repeat penalty calculation.",
     )
     # LoRA Params
-    lora_base: Optional[str] = Field(
+    lora_adapters: Optional[Dict[str, float]] = Field(
         default=None,
-        description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.",
-    )
-    lora_path: Optional[str] = Field(
-        default=None,
-        description="Path to a LoRA file to apply to the model.",
+        description="Paths to LoRA adapter files and the scale to apply them at (adapters with a scale of 0.0 are not used during inference).",
     )
     # Backend Params
     numa: Union[bool, int] = Field(
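
A hedged usage sketch of the new field: ModelSettings is a pydantic BaseSettings, so the dict can be supplied programmatically as shown here (or through whatever config/environment mechanism the server already uses, which this commit does not touch). The paths and scales are illustrative.

from llama_cpp.server.settings import ModelSettings

settings = ModelSettings(
    model="models/base.gguf",         # required base model path (placeholder)
    lora_adapters={
        "loras/summarize.gguf": 0.8,  # path -> scale
        "loras/classify.gguf": 0.0,   # listed but inactive (scale 0.0)
    },
)
# load_llama_from_model_settings(settings) then passes this mapping
# straight through to llama_cpp.Llama, as shown in model.py above.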
