
Commit 8e48f1c

fix: Caching for hot-swapping LoRA adapters
1 parent 30887d2 commit 8e48f1c

File tree

2 files changed: +31 −13 lines changed

docs/api-reference.md
llama_cpp/llama.py
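In short, this commit keys cached completion state by which LoRA adapters are currently active, so adapters can be hot-swapped between completions without the cache handing back state computed under a different adapter configuration. A rough usage sketch follows; the model and adapter paths, prompts, and scales are placeholders, and it assumes llama_cpp exposes Llama, LlamaRAMCache, set_cache(), and the set_lora_adapter_scale() method shown in the diff below.

# Hypothetical usage sketch; paths, prompts, and scales are placeholders, not from
# this commit. Assumes llama_cpp exposes Llama, LlamaRAMCache, set_cache(), and the
# set_lora_adapter_scale() method shown in the diff below.
from llama_cpp import Llama, LlamaRAMCache

llm = Llama(
    model_path="./base-model.gguf",                   # placeholder path
    lora_adapters={"./adapters/style-a.gguf": 1.0},   # start with one adapter active
)
llm.set_cache(LlamaRAMCache())  # completion state is now cached per adapter set

# First completion runs with style-a at scale 1.0 and populates the cache.
print(llm("Write a haiku about caching.", max_tokens=32)["choices"][0]["text"])

# Hot-swap adapters between completions, without reloading the base model.
llm.set_lora_adapter_scale("./adapters/style-a.gguf", 0.0)
llm.set_lora_adapter_scale("./adapters/style-b.gguf", 1.0, load_if_needed=True)

# Same prompt, different active adapters: the old cache entry is not reused.
print(llm("Write a haiku about caching.", max_tokens=32)["choices"][0]["text"])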

docs/api-reference.md

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ High-level Python bindings for llama.cpp.
 - __call__
 - create_chat_completion
 - create_chat_completion_openai_v1
+- set_lora_adapter_scale
 - set_cache
 - save_state
 - load_state

llama_cpp/llama.py

Lines changed: 30 additions & 13 deletions
@@ -18,6 +18,7 @@
     List,
     Literal,
     Optional,
+    Tuple,
     Union,
     Generator,
     Sequence,
@@ -352,7 +353,9 @@ def __init__(

         self.cache: Optional[BaseLlamaCache] = None

-        self.lora_adapters = lora_adapters
+        self.lora_adapters = (
+            lora_adapters if lora_adapters is not None else {}
+        )

         self.spm_infill = spm_infill

@@ -401,10 +404,13 @@ def __init__(
             )
         )

-        self._lora_adapters_by_path: Dict[str, internals.LlamaLoraAdapter] = {}
-        if self.lora_adapters:
-            for lora_path, scale in self.lora_adapters.items():
-                self.set_lora_adapter(lora_path, scale, load_if_needed=True)
+        # Dict from LoRA path to wrapper
+        self._lora_adapters_paths: Dict[str, internals.LlamaLoraAdapter] = {}
+        # Immutable value representing active adapters for use as a key
+        self._lora_adapters_active: Tuple[Tuple[str, float]] = ()
+
+        for lora_path, scale in self.lora_adapters.copy().items():
+            self.set_lora_adapter_scale(lora_path, scale, load_if_needed=True)

         if self.verbose:
             print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)
@@ -426,6 +432,7 @@ def __init__(
         self._candidates = internals.LlamaTokenDataArray(n_vocab=self._n_vocab)

         self.n_tokens = 0
+        self.tokens_lora_adapters: Tuple[Tuple[str, float]] = ()  # Adapters that processed tokens
         self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc)
         self.scores: npt.NDArray[np.single] = np.ndarray(
             (n_ctx if logits_all == True else n_batch, self._n_vocab), dtype=np.single
@@ -595,7 +602,7 @@ def set_seed(self, seed: int):
         """
         self._seed = seed

-    def set_lora_adapter(self, lora_path: str, scale: float, *, load_if_needed=False):
+    def set_lora_adapter_scale(self, lora_path: str, scale: float, *, load_if_needed=False):
         """
         Set the scale for a LoRA adapter or 0.0 to disable it for inference. If the LoRA adapter file
         has previously been loaded then this method will set its scale. If the LoRA adapter file has
@@ -608,7 +615,8 @@ def set_lora_adapter_scale(self, lora_path: str, scale: float, *, load_if_needed=False
                 method will attempt to load the adapter from the lora_path if needed. If False, loading an adapter that
                 hasn't already been loaded will raise an exception.
         """
-        lora_adapter = self._lora_adapters_by_path.get(lora_path)
+        # Load adapter if needed (even if scale 0.0)
+        lora_adapter = self._lora_adapters_paths.get(lora_path)
         if lora_adapter is None:
             lora_adapter = internals.LlamaLoraAdapter(
                 self._model,
@@ -619,15 +627,24 @@ def set_lora_adapter_scale(self, lora_path: str, scale: float, *, load_if_needed=False
                 raise RuntimeError(
                     f"Failed to initialize LoRA adapter from lora path: {lora_path}"
                 )
-            self._lora_adapters_by_path[lora_path] = lora_adapter
+            self._lora_adapters_paths[lora_path] = lora_adapter
+
         if scale == 0.0:
-            self._ctx.lora_adapter_remove(lora_adapter)  # Safe even if not in context
+            # Remove from context; safe to call even if not in context
+            self._ctx.lora_adapter_remove(lora_adapter)
         else:
+            # Set scale in context
             self._ctx.lora_adapter_set(lora_adapter, scale)

+        self.lora_adapters[lora_path] = scale
+        self._lora_adapters_active = tuple(sorted(
+            filter(lambda path_scale: path_scale[1] != 0.0, self.lora_adapters.items())
+        ))
+
     def reset(self):
         """Reset the model state."""
         self.n_tokens = 0
+        self.tokens_lora_adapters = self._lora_adapters_active

     def eval(self, tokens: Sequence[int]):
         """Evaluate a list of tokens.
@@ -879,7 +896,7 @@ def generate(
             )

         # Check for kv cache prefix match
-        if reset and self.n_tokens > 0:
+        if reset and self.n_tokens > 0 and self.tokens_lora_adapters == self._lora_adapters_active:
             longest_prefix = 0
             for a, b in zip(self._input_ids, tokens[:-1]):
                 if a == b:
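The guard added to generate() only attempts KV-cache prefix reuse when the adapters that evaluated the tokens already in context (`tokens_lora_adapters`, snapshotted in reset()) match the adapters active now. A toy restatement of that condition; the helper and sample values are hypothetical.

# Toy restatement of the guard above; the helper and sample values are hypothetical.
from typing import Tuple

AdapterKey = Tuple[Tuple[str, float], ...]

def can_reuse_kv_prefix(reset: bool, n_tokens: int,
                        tokens_lora_adapters: AdapterKey,
                        lora_adapters_active: AdapterKey) -> bool:
    # Mirrors the condition in generate(): only search for a shared token prefix when
    # the tokens already in context were evaluated under the currently active adapters.
    return reset and n_tokens > 0 and tokens_lora_adapters == lora_adapters_active

# Adapter was swapped between calls: fall through and re-evaluate from scratch.
assert not can_reuse_kv_prefix(True, 42, (("style-a.gguf", 1.0),), (("style-b.gguf", 1.0),))
# No swap: prefix matching proceeds as before this commit.
assert can_reuse_kv_prefix(True, 42, (("style-a.gguf", 1.0),), (("style-a.gguf", 1.0),))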
@@ -1296,7 +1313,7 @@ def logit_bias_processor(

         if self.cache:
             try:
-                cache_item = self.cache[prompt_tokens]
+                cache_item = self.cache[(self._lora_adapters_active, prompt_tokens)]
                 cache_prefix_len = Llama.longest_token_prefix(
                     cache_item.input_ids.tolist(), prompt_tokens
                 )
@@ -1634,15 +1651,15 @@ def logit_bias_processor(
             if self.cache:
                 if self.verbose:
                     print("Llama._create_completion: cache save", file=sys.stderr)
-                self.cache[prompt_tokens + completion_tokens] = self.save_state()
+                self.cache[(self._lora_adapters_active, prompt_tokens + completion_tokens)] = self.save_state()
                 if self.verbose:
                     print("Llama._create_completion: cache saved", file=sys.stderr)
                 return

         if self.cache:
             if self.verbose:
                 print("Llama._create_completion: cache save", file=sys.stderr)
-            self.cache[prompt_tokens + completion_tokens] = self.save_state()
+            self.cache[(self._lora_adapters_active, prompt_tokens + completion_tokens)] = self.save_state()

         text_str = text.decode("utf-8", errors="ignore")

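With the two cache-save sites and the lookup above all keyed on `(self._lora_adapters_active, tokens)`, a token sequence cached under one adapter set can no longer satisfy a lookup made under another. A toy illustration follows; a plain dict stands in for the cache, while the real BaseLlamaCache implementations do longest-prefix matching rather than exact lookups, and the token ids and adapter paths are placeholders.

# Toy illustration; a plain dict stands in for the cache, while the real
# BaseLlamaCache implementations do longest-prefix matching rather than exact lookups.
toy_cache = {}

prompt_tokens = (1, 15043, 3186)            # placeholder token ids
adapters_a = (("style-a.gguf", 1.0),)       # key component while style-a is active
adapters_b = (("style-b.gguf", 1.0),)       # key component after hot-swapping

toy_cache[(adapters_a, prompt_tokens)] = "state computed with style-a"

# Identical prompt tokens, different active adapters: different key, no stale hit.
assert (adapters_b, prompt_tokens) not in toy_cache
assert toy_cache[(adapters_a, prompt_tokens)] == "state computed with style-a"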