@@ -96,9 +96,7 @@ def __init__(
         # Sampling Params
         last_n_tokens_size: int = 64,
         # LoRA Params
-        lora_base: Optional[str] = None,
-        lora_scale: float = 1.0,
-        lora_path: Optional[str] = None,
+        lora_adapters: Optional[Dict[str, float]] = None,
         # Backend Params
         numa: Union[bool, int] = False,
         # Chat Format Params
@@ -174,8 +172,7 @@ def __init__(
             offload_kqv: Offload K, Q, V to GPU.
             flash_attn: Use flash attention.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
-            lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
-            lora_path: Path to a LoRA file to apply to the model.
+            lora_adapters: Paths to LoRA adapter files and the scale to apply them at (a scale of 0.0 disables the adapter during inference).
             numa: numa policy
             chat_format: String specifying the chat format to use when calling create_chat_completion.
             chat_handler: Optional chat handler to use when calling create_chat_completion.
@@ -243,7 +240,7 @@ def __init__(
         )  # keep a reference to the array so it is not gc'd
         self.model_params.tensor_split = self._c_tensor_split
         self.model_params.vocab_only = vocab_only
-        self.model_params.use_mmap = use_mmap if lora_path is None else False
+        self.model_params.use_mmap = use_mmap
         self.model_params.use_mlock = use_mlock

         # kv_overrides is the original python dict
@@ -355,9 +352,7 @@ def __init__(

         self.cache: Optional[BaseLlamaCache] = None

-        self.lora_base = lora_base
-        self.lora_scale = lora_scale
-        self.lora_path = lora_path
+        self.lora_adapters = lora_adapters

         self.spm_infill = spm_infill

@@ -406,32 +401,10 @@ def __init__(
             )
         )

-        self._lora_adapter: Optional[llama_cpp.llama_lora_adapter_p] = None
-
-        if self.lora_path:
-            self._lora_adapter = llama_cpp.llama_lora_adapter_init(
-                self._model.model,
-                self.lora_path.encode("utf-8"),
-            )
-            if self._lora_adapter is None:
-                raise RuntimeError(
-                    f"Failed to initialize LoRA adapter from lora path: {self.lora_path}"
-                )
-
-            def free_lora_adapter():
-                if self._lora_adapter is None:
-                    return
-                llama_cpp.llama_lora_adapter_free(self._lora_adapter)
-                self._lora_adapter = None
-
-            self._stack.callback(free_lora_adapter)
-
-            if llama_cpp.llama_lora_adapter_set(
-                self._ctx.ctx, self._lora_adapter, self.lora_scale
-            ):
-                raise RuntimeError(
-                    f"Failed to set LoRA adapter from lora path: {self.lora_path}"
-                )
+        self._lora_adapters_by_path: Dict[str, internals.LlamaLoraAdapter] = {}
+        if self.lora_adapters:
+            for lora_path, scale in self.lora_adapters.items():
+                self.set_lora_adapter(lora_path, scale, load_if_needed=True)

         if self.verbose:
             print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)
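With this change, adapters are passed to the constructor as a mapping from adapter path to scale and are loaded up front. A minimal usage sketch, with hypothetical model and adapter paths chosen for illustration:

```python
from llama_cpp import Llama

# Paths below are placeholders, not files shipped with the library.
llm = Llama(
    model_path="./models/base-model-q4.gguf",
    lora_adapters={
        "./adapters/style-lora.gguf": 1.0,   # applied at full strength
        "./adapters/domain-lora.gguf": 0.0,  # loaded now, but disabled for inference
    },
)
```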
@@ -621,6 +594,36 @@ def set_seed(self, seed: int):
             seed: The random seed.
         """
         self._seed = seed
+
+    def set_lora_adapter(self, lora_path: str, scale: float, *, load_if_needed=False):
+        """
+        Set the scale for a LoRA adapter, or pass 0.0 to disable it for inference. If the LoRA adapter
+        file has previously been loaded, this method simply updates its scale. If it has not been
+        loaded yet, this method raises an exception unless load_if_needed is set.
+
+        Args:
+            lora_path: The path to the LoRA adapter. Unless load_if_needed is True, this path must have been loaded when the `Llama` object was created.
+            scale: The scaling factor to apply to the LoRA adapter. If 0.0, the LoRA adapter will be disabled so it won't be used during inference.
+            load_if_needed: Whether or not to load the adapter if it has not previously been loaded. If True, this
+                method will attempt to load the adapter from lora_path if needed. If False, using an adapter that
+                hasn't already been loaded will raise an exception.
+        """
+        lora_adapter = self._lora_adapters_by_path.get(lora_path)
+        if lora_adapter is None:
+            lora_adapter = internals.LlamaLoraAdapter(
+                self._model,
+                lora_path,
+                verbose=self.verbose,
+            )
+            if lora_adapter is None:
+                raise RuntimeError(
+                    f"Failed to initialize LoRA adapter from lora path: {lora_path}"
+                )
+            self._lora_adapters_by_path[lora_path] = lora_adapter
+        if scale == 0.0:
+            self._ctx.lora_adapter_remove(lora_adapter)  # Safe even if not in context
+        else:
+            self._ctx.lora_adapter_set(lora_adapter, scale)

     def reset(self):
         """Reset the model state."""
@@ -2096,9 +2099,7 @@ def __getstate__(self):
             # Sampling Params
             last_n_tokens_size=self.last_n_tokens_size,
             # LoRA Params
-            lora_base=self.lora_base,
-            lora_scale=self.lora_scale,
-            lora_path=self.lora_path,
+            lora_adapters=self.lora_adapters,
             # Backend Params
             numa=self.numa,
             # Chat Format Params