From 5cc40d77b9c7d4a0ec77558858d80ce2a0910b24 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:11:00 +0200 Subject: [PATCH 001/177] Create linter.yml --- .github/workflows/linter.yml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .github/workflows/linter.yml diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml new file mode 100644 index 000000000..c85ec6654 --- /dev/null +++ b/.github/workflows/linter.yml @@ -0,0 +1,27 @@ +name: Ruff + +on: + push: + branches: [ "main" ] + pull_request: + # The branches below must be a subset of the branches above + branches: [ "main" ] + schedule: + - cron: '30 3 * * *' + +permissions: + contents: read + +jobs: + ruff: + name: Ruff + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' + - uses: chartboost/ruff-action@v1 + with: + args: 'check --select B,F,I,PERF,NPY,PL,RUF,S,SIM,UP --output-format github --diff --exclude docker,docs,scripts,tests,vendor' From fb825d5fa4c4473fd04aaac769e873defaa75632 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:11:50 +0200 Subject: [PATCH 002/177] Create fixer.yml --- .github/fixer.yml | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 .github/fixer.yml diff --git a/.github/fixer.yml b/.github/fixer.yml new file mode 100644 index 000000000..7ea87c527 --- /dev/null +++ b/.github/fixer.yml @@ -0,0 +1,34 @@ +name: Fixer + +on: [push, pull_request] + +concurrency: + group: fixer-${{ github.event_name == 'pull_request' && format('{0}-{1}', github.workflow, github.event.pull_request.number) || github.workflow_ref }} + cancel-in-progress: true + +jobs: + ruff-lint: + name: Ruff + runs-on: ubuntu-latest + permissions: + # Give the default GITHUB_TOKEN write permission to commit and push the + # added or changed files to the repository. 
+ contents: write + + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' + + - uses: chartboost/ruff-action@v1 + with: + args: 'check --fix-only' + + - uses: stefanzweifel/git-auto-commit-action@v5 + with: + commit_message: 'style fixes by ruff' From 2b513d0d2361325446f5abce7171cd22e99e1833 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:13:46 +0200 Subject: [PATCH 003/177] Rename .github/fixer.yml to .github/workflows/fixer.yml --- .github/{ => workflows}/fixer.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/{ => workflows}/fixer.yml (100%) diff --git a/.github/fixer.yml b/.github/workflows/fixer.yml similarity index 100% rename from .github/fixer.yml rename to .github/workflows/fixer.yml From 41adecd03a0e23ebbe159b8a1bab6fe42e5dc9be Mon Sep 17 00:00:00 2001 From: Smartappli Date: Fri, 2 Aug 2024 15:14:04 +0000 Subject: [PATCH 004/177] style fixes by ruff --- examples/batch-processing/server.py | 1 - examples/low_level_api/Chat.py | 4 +++- examples/low_level_api/Miku.py | 3 ++- examples/low_level_api/ReasonAct.py | 5 +++-- examples/low_level_api/low_level_api_chat_cpp.py | 6 +++--- llama_cpp/llama.py | 5 +---- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/batch-processing/server.py b/examples/batch-processing/server.py index 0b36746f9..a7e5c8c38 100644 --- a/examples/batch-processing/server.py +++ b/examples/batch-processing/server.py @@ -23,7 +23,6 @@ app = FastAPI() -import openai.types.chat as types @app.post("/v1/chat/completions") diff --git a/examples/low_level_api/Chat.py b/examples/low_level_api/Chat.py index a755089b2..eba3ba33f 100644 --- a/examples/low_level_api/Chat.py +++ b/examples/low_level_api/Chat.py @@ -1,5 +1,7 @@ #!/bin/python -import sys, os, datetime +import sys +import os +import datetime from common import GptParams from low_level_api_chat_cpp import LLaMAInteract diff --git a/examples/low_level_api/Miku.py b/examples/low_level_api/Miku.py index e072ab1b1..257ffd938 100644 --- a/examples/low_level_api/Miku.py +++ b/examples/low_level_api/Miku.py @@ -1,5 +1,6 @@ #!/bin/python -import sys, os +import sys +import os from common import GptParams from low_level_api_chat_cpp import LLaMAInteract diff --git a/examples/low_level_api/ReasonAct.py b/examples/low_level_api/ReasonAct.py index 1f2c59017..eb3043bdb 100644 --- a/examples/low_level_api/ReasonAct.py +++ b/examples/low_level_api/ReasonAct.py @@ -1,5 +1,6 @@ #!/bin/python -import sys, os, datetime +import sys +import os from common import GptParams from low_level_api_chat_cpp import LLaMAInteract @@ -12,7 +13,7 @@ def env_or_def(env, default): MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") -prompt = f"""You run in a loop of Thought, Action, Observation. +prompt = """You run in a loop of Thought, Action, Observation. At the end of the loop either Answer or restate your Thought and Action. Use Thought to describe your thoughts about the question you have been asked. 
Use Action to run one of these actions available to you: diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 39081be17..347427576 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -174,7 +174,7 @@ def __init__(self, params: GptParams) -> None: file=sys.stderr, ) else: - print(f"session file does not exist, will create", file=sys.stderr) + print("session file does not exist, will create", file=sys.stderr) # tokenize the prompt self.embd = [] @@ -197,7 +197,7 @@ def __init__(self, params: GptParams) -> None: self.n_matching_session_tokens += 1 if self.n_matching_session_tokens >= len(self.embd_inp): - print(f"session file has exact match for prompt!") + print("session file has exact match for prompt!") elif self.n_matching_session_tokens < (len(self.embd_inp) / 2): print( f"warning: session file has low similarity to prompt ({self.n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated" @@ -668,7 +668,7 @@ def output(self): self.multibyte_fix[self.multibyte_fix.index(None)] = cur_char # Return completed utf char - if len(self.multibyte_fix) > 0 and not None in self.multibyte_fix: + if len(self.multibyte_fix) > 0 and None not in self.multibyte_fix: yield (b"".join(self.multibyte_fix)).decode("utf8") self.multibyte_fix = [] continue diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 0cb5ca2fc..6c39e932a 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -32,10 +32,7 @@ from .llama_types import * from .llama_grammar import LlamaGrammar from .llama_cache import ( - BaseLlamaCache, - LlamaCache, # type: ignore - LlamaDiskCache, # type: ignore - LlamaRAMCache, # type: ignore + BaseLlamaCache, # type: ignore ) from .llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer import llama_cpp.llama_cpp as llama_cpp From 72ecb43ed22b9275f5f6e5bea7e34b6b2d2a5651 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:18:31 +0200 Subject: [PATCH 005/177] Lint --- llama_cpp/_logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/_logger.py b/llama_cpp/_logger.py index 7638170a9..0f3147346 100644 --- a/llama_cpp/_logger.py +++ b/llama_cpp/_logger.py @@ -1,6 +1,6 @@ -import sys import ctypes import logging +import sys import llama_cpp From 57589a051fb6caf89b2b21301f60974d07feda13 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:19:07 +0200 Subject: [PATCH 006/177] Lint --- examples/hf_pull/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/hf_pull/main.py b/examples/hf_pull/main.py index dfed17516..3264cb15a 100644 --- a/examples/hf_pull/main.py +++ b/examples/hf_pull/main.py @@ -1,7 +1,6 @@ import llama_cpp import llama_cpp.llama_tokenizer - llama = llama_cpp.Llama.from_pretrained( repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF", filename="*q8_0.gguf", From f0a719c0a5036ee87102173ccca7c6c465162df1 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:19:44 +0200 Subject: [PATCH 007/177] Lint --- examples/high_level_api/fastapi_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/high_level_api/fastapi_server.py b/examples/high_level_api/fastapi_server.py index ee59767d6..4d003bcc2 100644 --- a/examples/high_level_api/fastapi_server.py +++ b/examples/high_level_api/fastapi_server.py @@ -26,6 +26,7 @@ """ import os + import uvicorn from llama_cpp.server.app import create_app From 
8b3cf53f84c7943379c81ceb74e29e45483bffcf Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:20:36 +0200 Subject: [PATCH 008/177] Lint --- examples/low_level_api/Miku.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/low_level_api/Miku.py b/examples/low_level_api/Miku.py index 257ffd938..8d99b0223 100644 --- a/examples/low_level_api/Miku.py +++ b/examples/low_level_api/Miku.py @@ -1,6 +1,7 @@ #!/bin/python -import sys import os +import sys + from common import GptParams from low_level_api_chat_cpp import LLaMAInteract From bab328c41a38e6cbeb8b249129e9ca7710eb9702 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:21:21 +0200 Subject: [PATCH 009/177] Lint --- examples/high_level_api/high_level_api_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/high_level_api/high_level_api_inference.py b/examples/high_level_api/high_level_api_inference.py index e41f37577..349445c7c 100644 --- a/examples/high_level_api/high_level_api_inference.py +++ b/examples/high_level_api/high_level_api_inference.py @@ -1,5 +1,5 @@ -import json import argparse +import json from llama_cpp import Llama From 70a18f3432ec6d7965453c5b290caab7eafa3bfc Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:22:05 +0200 Subject: [PATCH 010/177] Lint --- examples/high_level_api/high_level_api_streaming.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/high_level_api/high_level_api_streaming.py b/examples/high_level_api/high_level_api_streaming.py index 747c6130e..868cdee7c 100644 --- a/examples/high_level_api/high_level_api_streaming.py +++ b/examples/high_level_api/high_level_api_streaming.py @@ -1,5 +1,5 @@ -import json import argparse +import json from llama_cpp import Llama From c4dd629634caf8130260344b20577952dcb105c2 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:23:59 +0200 Subject: [PATCH 011/177] Lint --- llama_cpp/llama_cache.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py index 5220c7933..000ce1581 100644 --- a/llama_cpp/llama_cache.py +++ b/llama_cpp/llama_cache.py @@ -1,11 +1,11 @@ import sys from abc import ABC, abstractmethod +from collections import OrderedDict from typing import ( Optional, Sequence, Tuple, ) -from collections import OrderedDict import diskcache @@ -52,7 +52,7 @@ class LlamaRAMCache(BaseLlamaCache): def __init__(self, capacity_bytes: int = (2 << 30)): super().__init__(capacity_bytes) self.capacity_bytes = capacity_bytes - self.cache_state: OrderedDict[Tuple[int, ...], "llama_cpp.llama.LlamaState"] = ( + self.cache_state: OrderedDict[Tuple[int, ...], llama_cpp.llama.LlamaState] = ( OrderedDict() ) @@ -132,7 +132,7 @@ def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": _key = self._find_longest_prefix_key(key) if _key is None: raise KeyError("Key not found") - value: "llama_cpp.llama.LlamaState" = self.cache.pop(_key) # type: ignore + value: llama_cpp.llama.LlamaState = self.cache.pop(_key) # type: ignore # NOTE: This puts an integer as key in cache, which breaks, # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens # self.cache.push(_key, side="front") # type: ignore From 4aa4000c8328a03bcf53de4423c84aabc3625da7 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:24:40 +0200 Subject: [PATCH 012/177] Lint --- examples/low_level_api/low_level_api_llama_cpp.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py index ba3545771..f3e8eb569 100644 --- a/examples/low_level_api/low_level_api_llama_cpp.py +++ b/examples/low_level_api/low_level_api_llama_cpp.py @@ -1,6 +1,6 @@ import ctypes -import os import multiprocessing +import os import llama_cpp From 9a7a0cdcda2682cb9722fef83dfe546a317adde1 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:26:44 +0200 Subject: [PATCH 013/177] Lint --- examples/high_level_api/langchain_custom_llm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/high_level_api/langchain_custom_llm.py b/examples/high_level_api/langchain_custom_llm.py index b91632f5b..c01f7c387 100644 --- a/examples/high_level_api/langchain_custom_llm.py +++ b/examples/high_level_api/langchain_custom_llm.py @@ -1,9 +1,9 @@ import argparse - -from llama_cpp import Llama +from typing import Any, List, Mapping, Optional from langchain.llms.base import LLM -from typing import Optional, List, Mapping, Any + +from llama_cpp import Llama class LlamaLLM(LLM): @@ -42,8 +42,8 @@ def _identifying_params(self) -> Mapping[str, Any]: print(f"Answer: {answer.strip()}") # Using in a chain -from langchain.prompts import PromptTemplate from langchain.chains import LLMChain +from langchain.prompts import PromptTemplate prompt = PromptTemplate( input_variables=["product"], From 7ba0fa4d531d8bfb957e9430318402525a5126b4 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:29:31 +0200 Subject: [PATCH 014/177] Lint --- llama_cpp/_utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llama_cpp/_utils.py b/llama_cpp/_utils.py index 29628193b..729672f12 100644 --- a/llama_cpp/_utils.py +++ b/llama_cpp/_utils.py @@ -1,6 +1,5 @@ import os import sys - from typing import Any, Dict # Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor @@ -11,7 +10,7 @@ STDERR_FILENO = 2 -class suppress_stdout_stderr(object): +class suppress_stdout_stderr: # NOTE: these must be "saved" here to avoid exceptions when using # this context manager inside of a __del__ method sys = sys @@ -69,7 +68,7 @@ def __call__(cls, *args: Any, **kwargs: Any) -> Any: return cls._instances[cls] -class Singleton(object, metaclass=MetaSingleton): +class Singleton(metaclass=MetaSingleton): """ Base class for implementing the Singleton pattern. 
""" From fd20768a83f89cdf7a2a84ba4d231f321b6fcab2 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:31:26 +0200 Subject: [PATCH 015/177] Lint --- llama_cpp/server/settings.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index b20655813..cab184cd9 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -1,13 +1,12 @@ from __future__ import annotations import multiprocessing - -from typing import Optional, List, Literal, Union, Dict, cast -from typing_extensions import Self - +from typing import Dict, List, Literal, Optional, Union, cast + from pydantic import Field, model_validator from pydantic_settings import BaseSettings - +from typing_extensions import Self + import llama_cpp # Disable warning for model and model_alias settings From 3f75d6999ac30b8f55811da950312cecf4e7fcd5 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:31:58 +0200 Subject: [PATCH 016/177] Lint --- llama_cpp/llama_speculative.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_cpp/llama_speculative.py b/llama_cpp/llama_speculative.py index 39dfb903b..ab77fe9e3 100644 --- a/llama_cpp/llama_speculative.py +++ b/llama_cpp/llama_speculative.py @@ -1,5 +1,4 @@ import abc - from typing import Any import numpy as np From 6c2588049aaf00f0dce9e5a9802347b5a83be6b1 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:33:08 +0200 Subject: [PATCH 017/177] Lint --- examples/low_level_api/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api/util.py b/examples/low_level_api/util.py index ef8b1c1ee..93d664aed 100644 --- a/examples/low_level_api/util.py +++ b/examples/low_level_api/util.py @@ -45,7 +45,7 @@ def append(self, elem): def __getitem__(self, val): if isinstance(val, int): - if 0 > val or val >= self.size: + if val < 0 or val >= self.size: raise IndexError("Index out of range") return ( self.list[val] From 9bdb1d7d99e2a1849896597096a6b2bd1ef5890e Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:36:22 +0200 Subject: [PATCH 018/177] Lint --- examples/low_level_api/quantize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/low_level_api/quantize.py b/examples/low_level_api/quantize.py index 057ac389e..b6c4fa7a9 100644 --- a/examples/low_level_api/quantize.py +++ b/examples/low_level_api/quantize.py @@ -1,5 +1,6 @@ -import os import argparse +import os + import llama_cpp From 366542856e1e30ae885848769d1138a8c881a45d Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:37:08 +0200 Subject: [PATCH 019/177] Lint --- examples/low_level_api/common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py index a0212ff0d..e108b848f 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -1,7 +1,6 @@ -import os import argparse +import os import re - from dataclasses import dataclass, field from typing import List From 39cd74283982ea5bd1d3f26cd610ab6ed7f3487d Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:42:22 +0200 Subject: [PATCH 020/177] Lint --- llama_cpp/server/errors.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/llama_cpp/server/errors.py b/llama_cpp/server/errors.py index fbf9fd80d..51c0568f4 100644 --- a/llama_cpp/server/errors.py +++ 
b/llama_cpp/server/errors.py @@ -1,25 +1,25 @@ from __future__ import annotations import sys -import traceback import time +import traceback from re import compile, Match, Pattern -from typing import Callable, Coroutine, Optional, Tuple, Union, Dict -from typing_extensions import TypedDict - +from re import Match, Pattern, compile +from typing import Callable, Coroutine, Dict, Optional, Tuple, Union from fastapi import ( + HTTPException, Request, Response, - HTTPException, ) from fastapi.responses import JSONResponse from fastapi.routing import APIRoute +from typing_extensions import TypedDict from llama_cpp.server.types import ( + CreateChatCompletionRequest, CreateCompletionRequest, CreateEmbeddingRequest, - CreateChatCompletionRequest, ) @@ -46,7 +46,7 @@ class ErrorResponseFormatters: @staticmethod def context_length_exceeded( - request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], + request: Union[CreateCompletionRequest, CreateChatCompletionRequest], match, # type: Match[str] # type: ignore ) -> Tuple[int, ErrorResponse]: """Formatter for context length exceeded error""" @@ -84,7 +84,7 @@ def context_length_exceeded( @staticmethod def model_not_found( - request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], + request: Union[CreateCompletionRequest, CreateChatCompletionRequest], match, # type: Match[str] # type: ignore ) -> Tuple[int, ErrorResponse]: """Formatter for model_not_found error""" @@ -105,11 +105,11 @@ class RouteErrorHandler(APIRoute): # key: regex pattern for original error message from llama_cpp # value: formatter function pattern_and_formatters: Dict[ - "Pattern[str]", + Pattern[str], Callable[ [ - Union["CreateCompletionRequest", "CreateChatCompletionRequest"], - "Match[str]", + Union[CreateCompletionRequest, CreateChatCompletionRequest], + Match[str], ], Tuple[int, ErrorResponse], ], @@ -127,14 +127,14 @@ def error_message_wrapper( error: Exception, body: Optional[ Union[ - "CreateChatCompletionRequest", - "CreateCompletionRequest", - "CreateEmbeddingRequest", + CreateChatCompletionRequest, + CreateCompletionRequest, + CreateEmbeddingRequest, ] ] = None, ) -> Tuple[int, ErrorResponse]: """Wraps error message in OpenAI style error response""" - print(f"Exception: {str(error)}", file=sys.stderr) + print(f"Exception: {error!s}", file=sys.stderr) traceback.print_exc(file=sys.stderr) if body is not None and isinstance( body, From 63d0418dcdd8c24252513700c79e520e858176e4 Mon Sep 17 00:00:00 2001 From: Smartappli Date: Fri, 2 Aug 2024 15:42:42 +0000 Subject: [PATCH 021/177] style fixes by ruff --- llama_cpp/server/errors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_cpp/server/errors.py b/llama_cpp/server/errors.py index 51c0568f4..b898f41f5 100644 --- a/llama_cpp/server/errors.py +++ b/llama_cpp/server/errors.py @@ -4,7 +4,6 @@ import time import traceback from re import compile, Match, Pattern -from re import Match, Pattern, compile from typing import Callable, Coroutine, Dict, Optional, Tuple, Union from fastapi import ( From 8aae02619da3aec9f513f7cfa8abfcdd33f530d5 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 17:55:10 +0200 Subject: [PATCH 022/177] Lint --- llama_cpp/_internals.py | 116 +++++++++++++++++++--------------------- 1 file changed, 56 insertions(+), 60 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index dcd4e17ff..ab056caa6 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -1,26 +1,23 @@ from __future__ import annotations -import os 
import ctypes - +import os +from contextlib import ExitStack +from dataclasses import dataclass, field from typing import ( Dict, List, Optional, Sequence, ) -from dataclasses import dataclass, field -from contextlib import ExitStack import numpy as np import numpy.typing as npt -from .llama_types import * -from .llama_grammar import LlamaGrammar +from llama_cpp import llama_cpp from ._utils import suppress_stdout_stderr - -import llama_cpp.llama_cpp as llama_cpp - +from .llama_grammar import LlamaGrammar +from .llama_types import * # Python wrappers over llama.h structs @@ -351,7 +348,7 @@ def get_state_size(self) -> int: # TODO: llama_save_session_file - def decode(self, batch: "_LlamaBatch"): + def decode(self, batch: _LlamaBatch): assert self.ctx is not None assert batch.batch is not None return_code = llama_cpp.llama_decode( @@ -385,8 +382,8 @@ def set_rng_seed(self, seed: int): def sample_repetition_penalties( self, - candidates: "_LlamaTokenDataArray", - last_tokens_data: "llama_cpp.Array[llama_cpp.llama_token]", + candidates: _LlamaTokenDataArray, + last_tokens_data: llama_cpp.Array[llama_cpp.llama_token], penalty_last_n: int, penalty_repeat: float, penalty_freq: float, @@ -403,33 +400,33 @@ def sample_repetition_penalties( penalty_present, ) - def sample_softmax(self, candidates: "_LlamaTokenDataArray"): + def sample_softmax(self, candidates: _LlamaTokenDataArray): assert self.ctx is not None llama_cpp.llama_sample_softmax( self.ctx, llama_cpp.byref(candidates.candidates), ) - def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int): + def sample_top_k(self, candidates: _LlamaTokenDataArray, k: int, min_keep: int): assert self.ctx is not None llama_cpp.llama_sample_top_k( self.ctx, llama_cpp.byref(candidates.candidates), k, min_keep ) - def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): + def sample_top_p(self, candidates: _LlamaTokenDataArray, p: float, min_keep: int): assert self.ctx is not None llama_cpp.llama_sample_top_p( self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep ) - def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): + def sample_min_p(self, candidates: _LlamaTokenDataArray, p: float, min_keep: int): assert self.ctx is not None llama_cpp.llama_sample_min_p( self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep ) def sample_tail_free( - self, candidates: "_LlamaTokenDataArray", z: float, min_keep: int + self, candidates: _LlamaTokenDataArray, z: float, min_keep: int ): assert self.ctx is not None llama_cpp.llama_sample_tail_free( @@ -437,20 +434,20 @@ def sample_tail_free( ) def sample_typical( - self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int + self, candidates: _LlamaTokenDataArray, p: float, min_keep: int ): assert self.ctx is not None llama_cpp.llama_sample_typical( self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep ) - def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float): + def sample_temp(self, candidates: _LlamaTokenDataArray, temp: float): assert self.ctx is not None llama_cpp.llama_sample_temp( self.ctx, llama_cpp.byref(candidates.candidates), temp ) - def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar): + def sample_grammar(self, candidates: _LlamaTokenDataArray, grammar: LlamaGrammar): assert self.ctx is not None assert grammar.grammar is not None llama_cpp.llama_sample_grammar( @@ -461,7 +458,7 @@ def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: 
LlamaGramm def sample_token_mirostat( self, - candidates: "_LlamaTokenDataArray", + candidates: _LlamaTokenDataArray, tau: float, eta: float, m: int, @@ -493,14 +490,14 @@ def sample_token_mirostat_v2( mu, ) - def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int: + def sample_token_greedy(self, candidates: _LlamaTokenDataArray) -> int: assert self.ctx is not None return llama_cpp.llama_sample_token_greedy( self.ctx, llama_cpp.byref(candidates.candidates), ) - def sample_token(self, candidates: "_LlamaTokenDataArray") -> int: + def sample_token(self, candidates: _LlamaTokenDataArray) -> int: assert self.ctx is not None return llama_cpp.llama_sample_token( self.ctx, @@ -822,44 +819,43 @@ def sample( id = token_data_array.candidates_data.id[0] elif self.params.temp == 0: id = ctx_main.sample_token_greedy(token_data_array) + elif self.params.mirostat == 1: + mirostat_m = 100 + ctx_main.sample_temp(token_data_array, self.params.temp) + id = ctx_main.sample_token_mirostat( + token_data_array, + self.params.mirostat_tau, + self.params.mirostat_eta, + mirostat_m, + ctypes.pointer(self.mirostat_mu), + ) + elif self.params.mirostat == 2: + ctx_main.sample_temp(token_data_array, self.params.temp) + id = ctx_main.sample_token_mirostat_v2( + token_data_array, + self.params.mirostat_tau, + self.params.mirostat_eta, + ctypes.pointer(self.mirostat_mu), + ) else: - if self.params.mirostat == 1: - mirostat_m = 100 - ctx_main.sample_temp(token_data_array, self.params.temp) - id = ctx_main.sample_token_mirostat( - token_data_array, - self.params.mirostat_tau, - self.params.mirostat_eta, - mirostat_m, - ctypes.pointer(self.mirostat_mu), - ) - elif self.params.mirostat == 2: - ctx_main.sample_temp(token_data_array, self.params.temp) - id = ctx_main.sample_token_mirostat_v2( - token_data_array, - self.params.mirostat_tau, - self.params.mirostat_eta, - ctypes.pointer(self.mirostat_mu), - ) - else: - min_keep = max(1, self.params.n_probs) - ctx_main.sample_top_k( - token_data_array, self.params.top_k, min_keep=min_keep - ) - ctx_main.sample_tail_free( - token_data_array, self.params.tfs_z, min_keep=min_keep - ) - ctx_main.sample_typical( - token_data_array, self.params.typical_p, min_keep=min_keep - ) - ctx_main.sample_top_p( - token_data_array, self.params.top_p, min_keep=min_keep - ) - ctx_main.sample_min_p( - token_data_array, self.params.min_p, min_keep=min_keep - ) - ctx_main.sample_temp(token_data_array, self.params.temp) - id = ctx_main.sample_token(token_data_array) + min_keep = max(1, self.params.n_probs) + ctx_main.sample_top_k( + token_data_array, self.params.top_k, min_keep=min_keep + ) + ctx_main.sample_tail_free( + token_data_array, self.params.tfs_z, min_keep=min_keep + ) + ctx_main.sample_typical( + token_data_array, self.params.typical_p, min_keep=min_keep + ) + ctx_main.sample_top_p( + token_data_array, self.params.top_p, min_keep=min_keep + ) + ctx_main.sample_min_p( + token_data_array, self.params.min_p, min_keep=min_keep + ) + ctx_main.sample_temp(token_data_array, self.params.temp) + id = ctx_main.sample_token(token_data_array) return id def accept(self, ctx_main: _LlamaContext, id: int, apply_grammar: bool): From 567625f528315dce8aabf1a9df69bd8819b4476d Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:00:28 +0200 Subject: [PATCH 023/177] Lint --- .../low_level_api/low_level_api_chat_cpp.py | 118 +++++++++--------- 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py 
b/examples/low_level_api/low_level_api_chat_cpp.py index 347427576..5715f3f40 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -13,12 +13,13 @@ import ctypes import sys -from time import time from os import cpu_count, path +from time import time -import llama_cpp -from common import GptParams, gpt_params_parse, gpt_random_prompt import util +from common import GptParams, gpt_params_parse, gpt_random_prompt + +import llama_cpp # A LLaMA interactive session @@ -475,63 +476,62 @@ def generate(self): if self.params.temp <= 0: # Greedy sampling id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p) + elif self.params.mirostat == 1: + mirostat_mu = 2.0 * self.params.mirostat_tau + mirostat_m = 100 + llama_cpp.llama_sample_temperature( + self.ctx, candidates_p, llama_cpp.c_float(self.params.temp) + ) + id = llama_cpp.llama_sample_token_mirostat( + self.ctx, + candidates_p, + llama_cpp.c_float(self.params.mirostat_tau), + llama_cpp.c_float(self.params.mirostat_eta), + llama_cpp.c_int(mirostat_m), + llama_cpp.c_float(mirostat_mu), + ) + elif self.params.mirostat == 2: + mirostat_mu = 2.0 * self.params.mirostat_tau + llama_cpp.llama_sample_temperature( + self.ctx, candidates_p, llama_cpp.c_float(self.params.temp) + ) + id = llama_cpp.llama_sample_token_mirostat_v2( + self.ctx, + candidates_p, + llama_cpp.c_float(self.params.mirostat_tau), + llama_cpp.c_float(self.params.mirostat_eta), + llama_cpp.c_float(mirostat_mu), + ) else: - if self.params.mirostat == 1: - mirostat_mu = 2.0 * self.params.mirostat_tau - mirostat_m = 100 - llama_cpp.llama_sample_temperature( - self.ctx, candidates_p, llama_cpp.c_float(self.params.temp) - ) - id = llama_cpp.llama_sample_token_mirostat( - self.ctx, - candidates_p, - llama_cpp.c_float(self.params.mirostat_tau), - llama_cpp.c_float(self.params.mirostat_eta), - llama_cpp.c_int(mirostat_m), - llama_cpp.c_float(mirostat_mu), - ) - elif self.params.mirostat == 2: - mirostat_mu = 2.0 * self.params.mirostat_tau - llama_cpp.llama_sample_temperature( - self.ctx, candidates_p, llama_cpp.c_float(self.params.temp) - ) - id = llama_cpp.llama_sample_token_mirostat_v2( - self.ctx, - candidates_p, - llama_cpp.c_float(self.params.mirostat_tau), - llama_cpp.c_float(self.params.mirostat_eta), - llama_cpp.c_float(mirostat_mu), - ) - else: - # Temperature sampling - llama_cpp.llama_sample_top_k( - self.ctx, - candidates_p, - top_k, - min_keep=llama_cpp.c_size_t(1), - ) - llama_cpp.llama_sample_tail_free( - self.ctx, - candidates_p, - llama_cpp.c_float(self.params.tfs_z), - min_keep=llama_cpp.c_size_t(1), - ) - llama_cpp.llama_sample_typical( - self.ctx, - candidates_p, - llama_cpp.c_float(self.params.typical_p), - min_keep=llama_cpp.c_size_t(1), - ) - llama_cpp.llama_sample_top_p( - self.ctx, - candidates_p, - llama_cpp.c_float(self.params.top_p), - min_keep=llama_cpp.c_size_t(1), - ) - llama_cpp.llama_sample_temperature( - self.ctx, candidates_p, llama_cpp.c_float(self.params.temp) - ) - id = llama_cpp.llama_sample_token(self.ctx, candidates_p) + # Temperature sampling + llama_cpp.llama_sample_top_k( + self.ctx, + candidates_p, + top_k, + min_keep=llama_cpp.c_size_t(1), + ) + llama_cpp.llama_sample_tail_free( + self.ctx, + candidates_p, + llama_cpp.c_float(self.params.tfs_z), + min_keep=llama_cpp.c_size_t(1), + ) + llama_cpp.llama_sample_typical( + self.ctx, + candidates_p, + llama_cpp.c_float(self.params.typical_p), + min_keep=llama_cpp.c_size_t(1), + ) + llama_cpp.llama_sample_top_p( + self.ctx, + 
candidates_p, + llama_cpp.c_float(self.params.top_p), + min_keep=llama_cpp.c_size_t(1), + ) + llama_cpp.llama_sample_temperature( + self.ctx, candidates_p, llama_cpp.c_float(self.params.temp) + ) + id = llama_cpp.llama_sample_token(self.ctx, candidates_p) # print("`{}`".format(candidates_p.size)) self.last_n_tokens.pop(0) From 8c5817f040d69dbec92fc9917c1bdc3cb61037c6 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:01:38 +0200 Subject: [PATCH 024/177] Lint --- examples/low_level_api/ReasonAct.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/low_level_api/ReasonAct.py b/examples/low_level_api/ReasonAct.py index eb3043bdb..20bd33bfa 100644 --- a/examples/low_level_api/ReasonAct.py +++ b/examples/low_level_api/ReasonAct.py @@ -1,6 +1,7 @@ #!/bin/python -import sys import os +import sys + from common import GptParams from low_level_api_chat_cpp import LLaMAInteract From 28c539d782cc48eaa30abc5db3dfbde43f103e3c Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:02:35 +0200 Subject: [PATCH 025/177] Lint --- examples/low_level_api/Chat.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/low_level_api/Chat.py b/examples/low_level_api/Chat.py index eba3ba33f..fcbc79c69 100644 --- a/examples/low_level_api/Chat.py +++ b/examples/low_level_api/Chat.py @@ -1,7 +1,8 @@ #!/bin/python -import sys -import os import datetime +import os +import sys + from common import GptParams from low_level_api_chat_cpp import LLaMAInteract From 4c147386eb024c1151702e6d6e5a8194c8332d08 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:03:42 +0200 Subject: [PATCH 026/177] Lint --- examples/gradio_chat/local.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/gradio_chat/local.py b/examples/gradio_chat/local.py index e16bf234a..617e340b2 100644 --- a/examples/gradio_chat/local.py +++ b/examples/gradio_chat/local.py @@ -1,8 +1,8 @@ +import gradio as gr + import llama_cpp import llama_cpp.llama_tokenizer -import gradio as gr - llama = llama_cpp.Llama.from_pretrained( repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF", filename="*q8_0.gguf", From d445fdb83398194928d5d4e78652a4817bc11a36 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:04:21 +0200 Subject: [PATCH 027/177] Lint --- examples/gradio_chat/server.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/gradio_chat/server.py b/examples/gradio_chat/server.py index 52061bea6..34f4f7548 100644 --- a/examples/gradio_chat/server.py +++ b/examples/gradio_chat/server.py @@ -1,5 +1,4 @@ import gradio as gr - from openai import OpenAI client = OpenAI(base_url="http://localhost:8000/v1", api_key="llama.cpp") From bd03dfb8070879dd6e3323321efa54b0082f7eeb Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:05:17 +0200 Subject: [PATCH 028/177] Lint --- llama_cpp/server/cli.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llama_cpp/server/cli.py b/llama_cpp/server/cli.py index 3dd007676..b2b2ac6cd 100644 --- a/llama_cpp/server/cli.py +++ b/llama_cpp/server/cli.py @@ -1,8 +1,7 @@ from __future__ import annotations import argparse - -from typing import List, Literal, Union, Any, Type, TypeVar +from typing import Any, List, Literal, Type, TypeVar, Union from pydantic import BaseModel From 074b5cec985ad30cd82deddac77b7f849ac781c2 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:09:19 +0200 Subject: [PATCH 029/177] Lint --- examples/ray/llm.py | 4 
+++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/ray/llm.py b/examples/ray/llm.py index 2325dd303..7900571d1 100755 --- a/examples/ray/llm.py +++ b/examples/ray/llm.py @@ -1,7 +1,9 @@ -from starlette.requests import Request from typing import Dict + from ray import serve from ray.serve import Application +from starlette.requests import Request + from llama_cpp import Llama From 4bbb41c123aadc203fc387923dfe7cbe81ba95b1 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:10:21 +0200 Subject: [PATCH 030/177] Lint --- llama_cpp/server/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/errors.py b/llama_cpp/server/errors.py index b898f41f5..dae9960ba 100644 --- a/llama_cpp/server/errors.py +++ b/llama_cpp/server/errors.py @@ -3,7 +3,7 @@ import sys import time import traceback -from re import compile, Match, Pattern +from re import Match, Pattern, compile from typing import Callable, Coroutine, Dict, Optional, Tuple, Union from fastapi import ( From e7b7fc722eb2c84e63483b655dd6b7eae4c16227 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:12:04 +0200 Subject: [PATCH 031/177] Lint --- llama_cpp/_internals.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index ab056caa6..7b34182ac 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -15,6 +15,7 @@ import numpy.typing as npt from llama_cpp import llama_cpp + from ._utils import suppress_stdout_stderr from .llama_grammar import LlamaGrammar from .llama_types import * @@ -476,7 +477,7 @@ def sample_token_mirostat( def sample_token_mirostat_v2( self, - candidates: "_LlamaTokenDataArray", + candidates: _LlamaTokenDataArray, tau: float, eta: float, mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], From e7774f04d1ce7a1d726d8c0192833b3acbffdd3c Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:21:47 +0200 Subject: [PATCH 032/177] Lint --- llama_cpp/server/app.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index cd3255176..d53ceaf9c 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,49 +1,47 @@ from __future__ import annotations import os +import contextlib import json import typing -import contextlib - -from threading import Lock from functools import partial -from typing import Iterator, List, Optional, Union, Dict +from threading import Lock -import llama_cpp +from typing import Dict, Iterator, List, Optional, Union import anyio from anyio.streams.memory import MemoryObjectSendStream -from starlette.concurrency import run_in_threadpool, iterate_in_threadpool -from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body +from fastapi import APIRouter, Body, Depends, FastAPI, HTTPException, Request, status from fastapi.middleware import Middleware from fastapi.middleware.cors import CORSMiddleware from fastapi.security import HTTPBearer from sse_starlette.sse import EventSourceResponse -from starlette_context.plugins import RequestIdPlugin # type: ignore +from starlette.concurrency import iterate_in_threadpool, run_in_threadpool from starlette_context.middleware import RawContextMiddleware - +from starlette_context.plugins import RequestIdPlugin # type: ignore + +import llama_cpp +from llama_cpp.server.errors import RouteErrorHandler from llama_cpp.server.model import ( 
LlamaProxy, ) from llama_cpp.server.settings import ( ConfigFileSettings, - Settings, ModelSettings, ServerSettings, + Settings, ) from llama_cpp.server.types import ( + CreateChatCompletionRequest, CreateCompletionRequest, CreateEmbeddingRequest, - CreateChatCompletionRequest, + DetokenizeInputRequest, + DetokenizeInputResponse, ModelList, + TokenizeInputCountResponse, TokenizeInputRequest, TokenizeInputResponse, - TokenizeInputCountResponse, - DetokenizeInputRequest, - DetokenizeInputResponse, ) -from llama_cpp.server.errors import RouteErrorHandler - router = APIRouter(route_class=RouteErrorHandler) @@ -150,7 +148,7 @@ def create_app( set_llama_proxy(model_settings=model_settings) if server_settings.disable_ping_events: - set_ping_message_factory(lambda: bytes()) + set_ping_message_factory(lambda: b"") return app @@ -248,7 +246,7 @@ async def authenticate( "schema": { "type": "string", "title": "Server Side Streaming response, when stream=True. " - + "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format", # noqa: E501 + + "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format", "example": """data: {... see CreateCompletionResponse ...} \\n\\n data: ... \\n\\n ... data: [DONE]""", } }, @@ -386,7 +384,7 @@ async def create_embedding( "schema": { "type": "string", "title": "Server Side Streaming response, when stream=True" - + "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format", # noqa: E501 + + "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format", "example": """data: {... see CreateChatCompletionResponse ...} \\n\\n data: ... \\n\\n ... data: [DONE]""", } }, From d980fa973a5c66d59334abd3f70d4e7bff4b7ec4 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:26:13 +0200 Subject: [PATCH 033/177] Lint --- llama_cpp/llama_cpp.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 727195cf5..0d30d6be9 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1,22 +1,22 @@ from __future__ import annotations -import sys -import os import ctypes import functools +import os import pathlib - +import sys from typing import ( + TYPE_CHECKING, Any, Callable, + Generic, List, - Union, NewType, Optional, - TYPE_CHECKING, TypeVar, - Generic, + Union, ) + from typing_extensions import TypeAlias @@ -1767,7 +1767,7 @@ def llama_kv_cache_view_init( # // Free a KV cache view. (use only for debugging purposes) # LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view); @ctypes_function("llama_kv_cache_view_free", [llama_kv_cache_view_p], None) -def llama_kv_cache_view_free(view: "ctypes.pointer[llama_kv_cache_view]", /): # type: ignore +def llama_kv_cache_view_free(view: ctypes.pointer[llama_kv_cache_view], /): # type: ignore """Free a KV cache view. (use only for debugging purposes)""" ... 
From a55649c75ff4cc0aee19ce828fd739203cb8a2d4 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:36:57 +0200 Subject: [PATCH 034/177] Lint --- llama_cpp/llama.py | 63 ++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6c39e932a..6d109a984 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1,59 +1,56 @@ from __future__ import annotations +import contextlib +import ctypes +import fnmatch +import json +import multiprocessing import os import sys -import uuid import time -import json -import ctypes import typing -import fnmatch +import uuid import warnings -import contextlib -import multiprocessing +from collections import deque +from pathlib import Path from typing import ( Any, + Callable, + Deque, + Dict, + Generator, + Iterator, List, Literal, Optional, - Union, - Generator, Sequence, - Iterator, - Deque, - Callable, - Dict, -) -from collections import deque -from pathlib import Path - - -from .llama_types import * -from .llama_grammar import LlamaGrammar -from .llama_cache import ( - BaseLlamaCache, # type: ignore + Union, ) -from .llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer -import llama_cpp.llama_cpp as llama_cpp -import llama_cpp.llama_chat_format as llama_chat_format - -from llama_cpp.llama_speculative import LlamaDraftModel import numpy as np import numpy.typing as npt +from llama_cpp import llama_chat_format, llama_cpp +from llama_cpp.llama_speculative import LlamaDraftModel + from ._internals import ( - _LlamaModel, # type: ignore - _LlamaContext, # type: ignore _LlamaBatch, # type: ignore - _LlamaTokenDataArray, # type: ignore - _LlamaSamplingParams, # type: ignore + _LlamaContext, # type: ignore + _LlamaModel, # type: ignore _LlamaSamplingContext, # type: ignore + _LlamaSamplingParams, # type: ignore + _LlamaTokenDataArray, # type: ignore _normalize_embedding, # type: ignore ) from ._logger import set_verbose from ._utils import suppress_stdout_stderr +from .llama_cache import ( + BaseLlamaCache, # type: ignore +) +from .llama_grammar import LlamaGrammar +from .llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer +from .llama_types import * class Llama: @@ -1036,7 +1033,7 @@ def _create_completion( assert self._ctx is not None assert suffix is None or suffix.__class__ is str - completion_id: str = f"cmpl-{str(uuid.uuid4())}" + completion_id: str = f"cmpl-{uuid.uuid4()!s}" created: int = int(time.time()) bos_token_id: int = self.token_bos() cls_token_id: int = self._model.token_cls() @@ -2127,7 +2124,7 @@ def from_pretrained( local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", cache_dir: Optional[Union[str, os.PathLike[str]]] = None, **kwargs: Any, - ) -> "Llama": + ) -> Llama: """Create a Llama model from a pretrained model name or path. This method requires the huggingface-hub package. You can install it with `pip install huggingface-hub`. 
@@ -2142,7 +2139,7 @@ def from_pretrained( Returns: A Llama model.""" try: - from huggingface_hub import hf_hub_download, HfFileSystem + from huggingface_hub import HfFileSystem, hf_hub_download from huggingface_hub.utils import validate_repo_id except ImportError: raise ImportError( From 707d972d75f6fa5fa6a98dff0dbff71c1aaa5213 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:38:25 +0200 Subject: [PATCH 035/177] Lint --- llama_cpp/llama_types.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index bbb58afc3..a243962ba 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -7,9 +7,9 @@ """ -from typing import Any, List, Optional, Dict, Union -from typing_extensions import TypedDict, NotRequired, Literal - +from typing import Any, Dict, List, Optional, Union + +from typing_extensions import Literal, NotRequired, TypedDict # NOTE: Defining this correctly using annotations seems to break pydantic validation. # This is a workaround until we can figure out how to do this correctly From aa68377d9612348193ad0c575dfd7bde749c1b0c Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:41:28 +0200 Subject: [PATCH 036/177] Lint --- llama_cpp/server/app.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index d53ceaf9c..ec7da0712 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,12 +1,11 @@ from __future__ import annotations -import os import contextlib import json +import os import typing from functools import partial from threading import Lock - from typing import Dict, Iterator, List, Optional, Union import anyio @@ -19,7 +18,7 @@ from starlette.concurrency import iterate_in_threadpool, run_in_threadpool from starlette_context.middleware import RawContextMiddleware from starlette_context.plugins import RequestIdPlugin # type: ignore - + import llama_cpp from llama_cpp.server.errors import RouteErrorHandler from llama_cpp.server.model import ( From d51124ae8936ef87ee3e5fe02f6f36425861e375 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:42:52 +0200 Subject: [PATCH 037/177] Lint --- llama_cpp/server/model.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index c486f8885..afc3f91df 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -1,13 +1,10 @@ from __future__ import annotations import json - -from typing import Dict, Optional, Union, List +from typing import Dict, List, Optional, Union import llama_cpp -import llama_cpp.llama_speculative as llama_speculative -import llama_cpp.llama_tokenizer as llama_tokenizer - +from llama_cpp import llama_speculative, llama_tokenizer from llama_cpp.server.settings import ModelSettings From 1564d2419c00df8c59adb8b617b9f7c1dd42f844 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:43:29 +0200 Subject: [PATCH 038/177] Lint --- llama_cpp/llama.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6d109a984..6c006e96b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -13,7 +13,6 @@ import warnings from collections import deque from pathlib import Path - from typing import ( Any, Callable, From 3b23dbac898c90365d49d60e39c9cdeff1f9e632 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:45:12 +0200 Subject: [PATCH 039/177] 
Lint --- llama_cpp/server/types.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index fdd164456..e95ab11ac 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -1,13 +1,12 @@ from __future__ import annotations -from typing import List, Optional, Union, Dict -from typing_extensions import TypedDict, Literal +from typing import Dict, List, Optional, Union from pydantic import BaseModel, Field +from typing_extensions import Literal, TypedDict import llama_cpp - model_field = Field( description="The model to use for generating completions.", default=None ) From 03a8d12421b5000396be2b3b5149caeb59a6baaf Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:48:01 +0200 Subject: [PATCH 040/177] Lint --- llama_cpp/server/__main__.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index bbac4957e..f621d78af 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -24,20 +24,20 @@ from __future__ import annotations +import argparse import os import sys -import argparse import uvicorn from llama_cpp.server.app import create_app +from llama_cpp.server.cli import add_args_from_model, parse_model_from_args from llama_cpp.server.settings import ( - Settings, - ServerSettings, - ModelSettings, ConfigFileSettings, + ModelSettings, + ServerSettings, + Settings, ) -from llama_cpp.server.cli import add_args_from_model, parse_model_from_args def main(): @@ -62,9 +62,10 @@ def main(): with open(config_file, "rb") as f: # Check if yaml file if config_file.endswith(".yaml") or config_file.endswith(".yml"): - import yaml import json + import yaml + config_file_settings = ConfigFileSettings.model_validate_json( json.dumps(yaml.safe_load(f)) ) From 27dbd64785976a1d966320b648c08513b93c80d5 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:48:37 +0200 Subject: [PATCH 041/177] Lint --- llama_cpp/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 97c8d8174..057df5d8a 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ -from .llama_cpp import * from .llama import * +from .llama_cpp import * __version__ = "0.2.85" From e211d7bde945d092befc770193f5f8d855a3a345 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:50:59 +0200 Subject: [PATCH 042/177] Lint --- llama_cpp/llama_tokenizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_tokenizer.py b/llama_cpp/llama_tokenizer.py index 029bf2acc..6f235b658 100644 --- a/llama_cpp/llama_tokenizer.py +++ b/llama_cpp/llama_tokenizer.py @@ -2,9 +2,9 @@ import abc from typing import ( + Any, List, Optional, - Any, ) import llama_cpp @@ -62,7 +62,7 @@ def decode(self, tokens: List[int]) -> str: return self.detokenize(tokens).decode("utf-8", errors="ignore") @classmethod - def from_ggml_file(cls, path: str) -> "LlamaTokenizer": + def from_ggml_file(cls, path: str) -> LlamaTokenizer: return cls(llama_cpp.Llama(model_path=path, vocab_only=True)) @@ -92,7 +92,7 @@ def detokenize( return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer": + def from_pretrained(cls, pretrained_model_name_or_path: str) -> LlamaHFTokenizer: try: from transformers import 
AutoTokenizer except ImportError: From 2ae259646deb0c46c9a69ecebfe7c793904743c8 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:51:52 +0200 Subject: [PATCH 043/177] Delete .github/workflows/fixer.yml --- .github/workflows/fixer.yml | 34 ---------------------------------- 1 file changed, 34 deletions(-) delete mode 100644 .github/workflows/fixer.yml diff --git a/.github/workflows/fixer.yml b/.github/workflows/fixer.yml deleted file mode 100644 index 7ea87c527..000000000 --- a/.github/workflows/fixer.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: Fixer - -on: [push, pull_request] - -concurrency: - group: fixer-${{ github.event_name == 'pull_request' && format('{0}-{1}', github.workflow, github.event.pull_request.number) || github.workflow_ref }} - cancel-in-progress: true - -jobs: - ruff-lint: - name: Ruff - runs-on: ubuntu-latest - permissions: - # Give the default GITHUB_TOKEN write permission to commit and push the - # added or changed files to the repository. - contents: write - - steps: - - uses: actions/checkout@v4 - with: - ref: ${{ github.head_ref }} - - - uses: actions/setup-python@v5 - with: - python-version: '3.12' - cache: 'pip' - - - uses: chartboost/ruff-action@v1 - with: - args: 'check --fix-only' - - - uses: stefanzweifel/git-auto-commit-action@v5 - with: - commit_message: 'style fixes by ruff' From 22865651d199fe781a29f4c5facce338553bf6c2 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 18:58:20 +0200 Subject: [PATCH 044/177] Lint --- llama_cpp/llava_cpp.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py index b80d85913..dbf8ad19c 100644 --- a/llama_cpp/llava_cpp.py +++ b/llama_cpp/llava_cpp.py @@ -1,35 +1,36 @@ from __future__ import annotations -import sys -import os import ctypes import functools +import os +import pathlib +import sys from ctypes import ( + POINTER, + Structure, + _Pointer, # type: ignore c_bool, c_char_p, + c_float, c_int, c_uint8, - c_float, c_void_p, - POINTER, - _Pointer, # type: ignore - Structure, ) -import pathlib from typing import ( + TYPE_CHECKING, + Any, + Callable, + Generic, List, - Union, NewType, Optional, TypeVar, - Callable, - Any, - TYPE_CHECKING, - Generic, + Union, ) + from typing_extensions import TypeAlias -import llama_cpp.llama_cpp as llama_cpp +from llama_cpp import llama_cpp # Load the library @@ -181,7 +182,7 @@ def llava_image_embed_make_with_bytes( image_bytes: CtypesArray[c_uint8], image_bytes_length: Union[c_int, int], /, -) -> "_Pointer[llava_image_embed]": ... +) -> _Pointer[llava_image_embed]: ... # /** build an image embed from a path to an image filename */ @@ -193,13 +194,13 @@ def llava_image_embed_make_with_bytes( ) def llava_image_embed_make_with_filename( ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, / -) -> "_Pointer[llava_image_embed]": ... +) -> _Pointer[llava_image_embed]: ... # LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); # /** free an embedding made with llava_image_embed_make_* */ @ctypes_function("llava_image_embed_free", [POINTER(llava_image_embed)], None) -def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /): ... +def llava_image_embed_free(embed: _Pointer[llava_image_embed], /): ... # /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. 
*/ @@ -216,9 +217,9 @@ def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /): ... ) def llava_eval_image_embed( ctx_llama: llama_cpp.llama_context_p, - embed: "_Pointer[llava_image_embed]", + embed: _Pointer[llava_image_embed], n_batch: Union[c_int, int], - n_past: "_Pointer[c_int]", + n_past: _Pointer[c_int], /, ) -> bool: ... From 7481f9608885c05ac41d15c81955532bf08705c5 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 19:18:20 +0200 Subject: [PATCH 045/177] Lint --- llama_cpp/llama_chat_format.py | 205 ++++++++++++++++----------------- 1 file changed, 99 insertions(+), 106 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index ea8d07feb..a163a8465 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1,13 +1,12 @@ from __future__ import annotations -import os -import sys -import json import ctypes import dataclasses +import json +import os import random import string - +import sys from contextlib import ExitStack from typing import ( Any, @@ -16,24 +15,21 @@ List, Literal, Optional, + Protocol, Tuple, Union, - Protocol, cast, ) import jinja2 -from jinja2.sandbox import ImmutableSandboxedEnvironment - import numpy as np import numpy.typing as npt +from jinja2.sandbox import ImmutableSandboxedEnvironment -import llama_cpp.llama as llama -import llama_cpp.llama_types as llama_types -import llama_cpp.llama_grammar as llama_grammar +from llama_cpp import llama, llama_grammar, llama_types from ._logger import logger -from ._utils import suppress_stdout_stderr, Singleton +from ._utils import Singleton, suppress_stdout_stderr ### Common Chat Templates and Special Tokens ### @@ -1478,7 +1474,7 @@ def generate_schema_from_functions(functions, namespace="functions") -> str: schema += f" {param_name}{optional_indicator}: {param_type},\n" schema += " }) => any;\n\n" - schema += "}} // namespace {}\n".format(namespace) + schema += f"}} // namespace {namespace}\n" return schema def prepare_messages_for_inference( @@ -1858,7 +1854,7 @@ def generate_schema_from_functions(functions, namespace="functions") -> str: schema += f"{param_name}{optional_indicator}: {param_type},\n" schema += "}) => any;\n\n" - schema += "}} // namespace {}".format(namespace) + schema += f"}} // namespace {namespace}" return schema def prepare_messages_for_inference( @@ -1876,26 +1872,25 @@ def prepare_messages_for_inference( role="system", content=generate_schema_from_functions([]) ) ) - else: - if functions is not None: - all_messages.append( - llama_types.ChatCompletionRequestSystemMessage( - role="system", content=generate_schema_from_functions(functions) - ) - ) - elif tools is not None and tool_choice != "none": - all_messages.append( - llama_types.ChatCompletionRequestSystemMessage( - role="system", - content=generate_schema_from_functions( - [ - tool["function"] - for tool in tools - if tool["type"] == "function" - ] - ), - ) + elif functions is not None: + all_messages.append( + llama_types.ChatCompletionRequestSystemMessage( + role="system", content=generate_schema_from_functions(functions) ) + ) + elif tools is not None and tool_choice != "none": + all_messages.append( + llama_types.ChatCompletionRequestSystemMessage( + role="system", + content=generate_schema_from_functions( + [ + tool["function"] + for tool in tools + if tool["type"] == "function" + ] + ), + ) + ) all_messages.append( llama_types.ChatCompletionRequestSystemMessage( @@ -2483,88 +2478,86 @@ def generate_streaming(tools, functions, function_call, 
prompt): # If the prompt involves a function call, just append generated parameters to function_bodies else: function_bodies.append(completion_text.strip()) - else: - # If tool_choice/function_call is provided - if isinstance(function_call, dict): - prompt += f"{function_call['name']}\n{CONTENT_TOKEN}" - function_call = function_call["name"] - function_calls.append(function_call) - grammar = get_grammar(function_call) - stops = [STOP_TOKEN, FROM_TOKEN] + elif isinstance(function_call, dict): + prompt += f"{function_call['name']}\n{CONTENT_TOKEN}" + function_call = function_call["name"] + function_calls.append(function_call) + grammar = get_grammar(function_call) + stops = [STOP_TOKEN, FROM_TOKEN] + completion = create_completion( + prompt=prompt, stop=stops, grammar=grammar + ) + completion_text = completion["choices"][0]["text"] + completion_tokens += completion["usage"]["completion_tokens"] + function_bodies.append(completion_text.strip()) + # If "auto" or no tool_choice/function_call + elif isinstance(function_call, str) and function_call == "auto": + while True: + # Generate function name first + grammar = None + stops = CONTENT_TOKEN completion = create_completion( prompt=prompt, stop=stops, grammar=grammar ) completion_text = completion["choices"][0]["text"] completion_tokens += completion["usage"]["completion_tokens"] - function_bodies.append(completion_text.strip()) - # If "auto" or no tool_choice/function_call - elif isinstance(function_call, str) and function_call == "auto": - while True: - # Generate function name first - grammar = None - stops = CONTENT_TOKEN - completion = create_completion( - prompt=prompt, stop=stops, grammar=grammar - ) - completion_text = completion["choices"][0]["text"] - completion_tokens += completion["usage"]["completion_tokens"] - function_name = completion_text.strip() - if function_name == "all": - prompt += "all\n<|content|>" + function_name = completion_text.strip() + if function_name == "all": + prompt += "all\n<|content|>" + else: + function_call = completion_text.strip() + prompt += f"{function_call}\n<|content|>" + function_calls.append(function_call) + grammar = get_grammar(function_call) + # Generate content + stops = [RECIPIENT_TOKEN, STOP_TOKEN] + completion = create_completion( + prompt=prompt, stop=stops, grammar=grammar + ) + completion_text = completion["choices"][0]["text"] + completion_tokens += completion["usage"]["completion_tokens"] + if function_name == "all": + if completion_text.endswith("\n<|from|>assistant\n"): + content += completion_text[: -len("\n<|from|>assistant\n")] + if completion_text.endswith("\n<|from|> assistant\n"): + content += completion_text[-len("\n<|from|> assistant\n")] else: - function_call = completion_text.strip() - prompt += f"{function_call}\n<|content|>" - function_calls.append(function_call) - grammar = get_grammar(function_call) - # Generate content - stops = [RECIPIENT_TOKEN, STOP_TOKEN] + content += completion_text + content = content.lstrip() + # Check whether the model wants to generate another turn + if ( + "<|from|> assistant" in completion_text + or "<|from|>assistant" in completion_text + ): + if completion_text.endswith("\n<|from|>assistant\n"): + cleaned_completion_text = completion_text[ + : -len("\n<|from|>assistant\n") + ].strip() + elif completion_text.endswith("\n<|from|> assistant\n"): + cleaned_completion_text = completion_text[ + -len("\n<|from|> assistant\n") + ].strip() + else: + cleaned_completion_text = completion_text.strip() + prompt += 
f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>" + else: + break + else: + function_bodies.append(completion_text.strip()) + # Check whether the model wants to generate another turn + prompt += completion_text.strip() + grammar = None completion = create_completion( prompt=prompt, stop=stops, grammar=grammar ) - completion_text = completion["choices"][0]["text"] completion_tokens += completion["usage"]["completion_tokens"] - if function_name == "all": - if completion_text.endswith("\n<|from|>assistant\n"): - content += completion_text[: -len("\n<|from|>assistant\n")] - if completion_text.endswith("\n<|from|> assistant\n"): - content += completion_text[-len("\n<|from|> assistant\n")] - else: - content += completion_text - content = content.lstrip() - # Check whether the model wants to generate another turn - if ( - "<|from|> assistant" in completion_text - or "<|from|>assistant" in completion_text - ): - if completion_text.endswith("\n<|from|>assistant\n"): - cleaned_completion_text = completion_text[ - : -len("\n<|from|>assistant\n") - ].strip() - elif completion_text.endswith("\n<|from|> assistant\n"): - cleaned_completion_text = completion_text[ - -len("\n<|from|> assistant\n") - ].strip() - else: - cleaned_completion_text = completion_text.strip() - prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>" - else: - break + if ( + "<|from|> assistant" in completion["choices"][0]["text"] + or "<|from|>assistant" in completion["choices"][0]["text"] + ): + prompt += "\n<|from|>assistant\n<|recipient|>" else: - function_bodies.append(completion_text.strip()) - # Check whether the model wants to generate another turn - prompt += completion_text.strip() - grammar = None - completion = create_completion( - prompt=prompt, stop=stops, grammar=grammar - ) - completion_tokens += completion["usage"]["completion_tokens"] - if ( - "<|from|> assistant" in completion["choices"][0]["text"] - or "<|from|>assistant" in completion["choices"][0]["text"] - ): - prompt += "\n<|from|>assistant\n<|recipient|>" - else: - break + break assert "usage" in completion assert len(function_calls) == len(function_bodies) @@ -2667,7 +2660,7 @@ class Llava15ChatHandler: ) def __init__(self, clip_model_path: str, verbose: bool = True): - import llama_cpp.llava_cpp as llava_cpp + from llama_cpp import llava_cpp self.clip_model_path = clip_model_path self.verbose = verbose @@ -2978,12 +2971,12 @@ def from_pretrained( local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", cache_dir: Optional[Union[str, os.PathLike[str]]] = None, **kwargs: Any, - ) -> "Llava15ChatHandler": + ) -> Llava15ChatHandler: import fnmatch from pathlib import Path try: - from huggingface_hub import hf_hub_download, HfFileSystem # type: ignore + from huggingface_hub import HfFileSystem, hf_hub_download # type: ignore from huggingface_hub.utils import validate_repo_id # type: ignore except ImportError: raise ImportError( From 8ed98ae0817e3645ae80883048ef65b0e1df7c15 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 19:23:05 +0200 Subject: [PATCH 046/177] Update __init__.py --- llama_cpp/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 057df5d8a..97c8d8174 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ -from .llama import * from .llama_cpp import * +from .llama import * __version__ = "0.2.85" From f2fbc900d14b06b522670398d5a6cf2ee336d1d8 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE 
Date: Fri, 2 Aug 2024 19:24:28 +0200 Subject: [PATCH 047/177] Update __init__.py --- llama_cpp/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 97c8d8174..6addf2a27 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ -from .llama_cpp import * -from .llama import * +from .llama_cpp import * # noqa +from .llama import * # noqa __version__ = "0.2.85" From 376fdb02bcebc4df87658aacff696030b9819e3b Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 19:34:29 +0200 Subject: [PATCH 048/177] Update linter.yml --- .github/workflows/linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index c85ec6654..43bdba50d 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -24,4 +24,4 @@ jobs: cache: 'pip' - uses: chartboost/ruff-action@v1 with: - args: 'check --select B,F,I,PERF,NPY,PL,RUF,S,SIM,UP --output-format github --diff --exclude docker,docs,scripts,tests,vendor' + args: 'check --select B,C4,E,F,I,PERF,NPY,PL,RUF,S,SIM,UP,W --output-format github --diff --exclude docker,docs,scripts,tests,vendor' From 400679946a7e75910207d0b2f5a3827141f33b37 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 19:36:10 +0200 Subject: [PATCH 049/177] Lint --- llama_cpp/server/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index f621d78af..e06c45812 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -65,7 +65,7 @@ def main(): import json import yaml - + config_file_settings = ConfigFileSettings.model_validate_json( json.dumps(yaml.safe_load(f)) ) From 795465aa1b20f3f306bcb3b92946a47c79e4836f Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 19:36:47 +0200 Subject: [PATCH 050/177] Lint --- llama_cpp/server/settings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index cab184cd9..7976e2850 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -2,11 +2,11 @@ import multiprocessing from typing import Dict, List, Literal, Optional, Union, cast - + from pydantic import Field, model_validator from pydantic_settings import BaseSettings from typing_extensions import Self - + import llama_cpp # Disable warning for model and model_alias settings From d1f88ba9ea348225d77f141d477c5ffe9a3d3268 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 19:37:23 +0200 Subject: [PATCH 051/177] Lint --- llama_cpp/_internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 7b34182ac..08d004f5a 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -837,7 +837,7 @@ def sample( self.params.mirostat_tau, self.params.mirostat_eta, ctypes.pointer(self.mirostat_mu), - ) + ) else: min_keep = max(1, self.params.n_probs) ctx_main.sample_top_k( From 0c0f1dc9dfdedacbc10410a6561d12f26d521157 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 19:38:47 +0200 Subject: [PATCH 052/177] Lint --- llama_cpp/llama_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index a243962ba..909a8f779 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -8,7 
+8,7 @@ """ from typing import Any, Dict, List, Optional, Union - + from typing_extensions import Literal, NotRequired, TypedDict # NOTE: Defining this correctly using annotations seems to break pydantic validation. From 49e9aa7279ac7655e88c8dc8770f99ee435244fc Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 19:39:54 +0200 Subject: [PATCH 053/177] Lint --- llama_cpp/llama_chat_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index a163a8465..194fec07f 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2520,7 +2520,7 @@ def generate_streaming(tools, functions, function_call, prompt): if completion_text.endswith("\n<|from|>assistant\n"): content += completion_text[: -len("\n<|from|>assistant\n")] if completion_text.endswith("\n<|from|> assistant\n"): - content += completion_text[-len("\n<|from|> assistant\n")] + content += completion_text[-len("\n<|from|> assistant\n")] else: content += completion_text content = content.lstrip() From 0e94977db1d858fd022f590b4bffe5d21be9a52a Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 19:42:32 +0200 Subject: [PATCH 054/177] Lint --- llama_cpp/llama.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6c006e96b..e3d093e3d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1299,12 +1299,10 @@ def logit_bias_processor( token_offset = len(prompt_tokens) + returned_tokens logits = self._scores[token_offset - 1, :] current_logprobs = Llama.logits_to_logprobs(logits).tolist() - sorted_logprobs = list( - sorted( + sorted_logprobs = sorted( zip(current_logprobs, range(len(current_logprobs))), reverse=True, ) - ) top_logprob = { self.detokenize([i]).decode( "utf-8", errors="ignore" @@ -1438,12 +1436,10 @@ def logit_bias_processor( token_offset = len(prompt_tokens) + returned_tokens - 1 logits = self._scores[token_offset, :] current_logprobs = Llama.logits_to_logprobs(logits).tolist() - sorted_logprobs = list( - sorted( + sorted_logprobs = sorted( zip(current_logprobs, range(len(current_logprobs))), reverse=True, ) - ) top_logprob = { self.detokenize([i]).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] @@ -1571,11 +1567,9 @@ def logit_bias_processor( ) ) tokens.append(token_str) - sorted_logprobs = list( - sorted( + sorted_logprobs = sorted( zip(logprobs_token, range(len(logprobs_token))), reverse=True ) - ) token_logprobs.append(logprobs_token[int(token)]) top_logprob: Optional[Dict[str, float]] = { self.detokenize([i], prev_tokens=all_tokens[:idx]).decode( From f81bd5e7cecfa6dfa764923c3f63f7ec9353fa60 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 19:51:16 +0200 Subject: [PATCH 055/177] Add YTT, COM, ANN. 
DTZ rules --- .github/workflows/linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 43bdba50d..55738aaf9 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -24,4 +24,4 @@ jobs: cache: 'pip' - uses: chartboost/ruff-action@v1 with: - args: 'check --select B,C4,E,F,I,PERF,NPY,PL,RUF,S,SIM,UP,W --output-format github --diff --exclude docker,docs,scripts,tests,vendor' + args: 'check --select A,ANN,ASYNC,B,C4,COM,DTZ,E,EM,F,I,PERF,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' From 089e4f4e586b6b70e5bf4c16258ca69788a3a105 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 19:52:55 +0200 Subject: [PATCH 056/177] Lint --- llama_cpp/llama_speculative.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_speculative.py b/llama_cpp/llama_speculative.py index ab77fe9e3..cfb69bfac 100644 --- a/llama_cpp/llama_speculative.py +++ b/llama_cpp/llama_speculative.py @@ -8,7 +8,7 @@ class LlamaDraftModel(abc.ABC): @abc.abstractmethod def __call__( - self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any + self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any, ) -> npt.NDArray[np.intc]: raise NotImplementedError() @@ -54,7 +54,7 @@ def find_candidate_pred_tokens( return np.array([], dtype=np.intc) def __call__( - self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any + self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any, ) -> npt.NDArray[np.intc]: return self.find_candidate_pred_tokens( input_ids=input_ids, From 00cd30dacba2e11725fada016da01acab13920ed Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 19:53:39 +0200 Subject: [PATCH 057/177] Lint --- examples/hf_pull/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/hf_pull/main.py b/examples/hf_pull/main.py index 3264cb15a..ed1046b08 100644 --- a/examples/hf_pull/main.py +++ b/examples/hf_pull/main.py @@ -5,7 +5,7 @@ repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF", filename="*q8_0.gguf", tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( - "Qwen/Qwen1.5-0.5B" + "Qwen/Qwen1.5-0.5B", ), verbose=False, ) From 86cf326f3ec68856822903283bb2c0987abdfcca Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 19:54:27 +0200 Subject: [PATCH 058/177] Lint --- examples/high_level_api/fastapi_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/high_level_api/fastapi_server.py b/examples/high_level_api/fastapi_server.py index 4d003bcc2..469aa0996 100644 --- a/examples/high_level_api/fastapi_server.py +++ b/examples/high_level_api/fastapi_server.py @@ -35,5 +35,5 @@ app = create_app() uvicorn.run( - app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) + app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)), ) From 7b0dcbcfbda9b9c32aea4511cefed5a519bb29f2 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 19:57:32 +0200 Subject: [PATCH 059/177] Lint --- llama_cpp/server/settings.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 7976e2850..848b93869 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -17,7 +17,7 @@ class ModelSettings(BaseSettings): """Model settings used to load a Llama model.""" model: str = Field( - description="The 
path to the model to use for generating completions." + description="The path to the model to use for generating completions.", ) model_alias: Optional[str] = Field( default=None, @@ -43,7 +43,7 @@ class ModelSettings(BaseSettings): description="Split layers across multiple GPUs in proportion.", ) vocab_only: bool = Field( - default=False, description="Whether to only return the vocabulary." + default=False, description="Whether to only return the vocabulary.", ) use_mmap: bool = Field( default=llama_cpp.llama_supports_mmap(), @@ -63,11 +63,11 @@ class ModelSettings(BaseSettings): ) # Context Params seed: int = Field( - default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random." + default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.", ) n_ctx: int = Field(default=2048, ge=0, description="The context size.") n_batch: int = Field( - default=512, ge=1, description="The batch size to use per eval." + default=512, ge=1, description="The batch size to use per eval.", ) n_threads: int = Field( default=max(multiprocessing.cpu_count() // 2, 1), @@ -80,11 +80,11 @@ class ModelSettings(BaseSettings): description="The number of threads to use when batch processing. Use -1 for max cpu threads", ) rope_scaling_type: int = Field( - default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED + default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, ) rope_freq_base: float = Field(default=0.0, description="RoPE base frequency") rope_freq_scale: float = Field( - default=0.0, description="RoPE frequency scaling factor" + default=0.0, description="RoPE frequency scaling factor", ) yarn_ext_factor: float = Field(default=-1.0) yarn_attn_factor: float = Field(default=1.0) @@ -92,15 +92,15 @@ class ModelSettings(BaseSettings): yarn_beta_slow: float = Field(default=1.0) yarn_orig_ctx: int = Field(default=0) mul_mat_q: bool = Field( - default=True, description="if true, use experimental mul_mat_q kernels" + default=True, description="if true, use experimental mul_mat_q kernels", ) logits_all: bool = Field(default=True, description="Whether to return logits.") embedding: bool = Field(default=False, description="Whether to use embeddings.") offload_kqv: bool = Field( - default=True, description="Whether to offload kqv to the GPU." + default=True, description="Whether to offload kqv to the GPU.", ) flash_attn: bool = Field( - default=False, description="Whether to use flash attention." + default=False, description="Whether to use flash attention.", ) # Sampling Params last_n_tokens_size: int = Field( @@ -178,11 +178,11 @@ class ModelSettings(BaseSettings): ) # Misc verbose: bool = Field( - default=True, description="Whether to print debug information." 
+ default=True, description="Whether to print debug information.", ) @model_validator( - mode="before" + mode="before", ) # pre=True to ensure this runs before any other validation def set_dynamic_defaults(self) -> Self: # If n_threads or n_threads_batch is -1, set it to multiprocessing.cpu_count() @@ -202,10 +202,10 @@ class ServerSettings(BaseSettings): host: str = Field(default="localhost", description="Listen address") port: int = Field(default=8000, description="Listen port") ssl_keyfile: Optional[str] = Field( - default=None, description="SSL key file for HTTPS" + default=None, description="SSL key file for HTTPS", ) ssl_certfile: Optional[str] = Field( - default=None, description="SSL certificate file for HTTPS" + default=None, description="SSL certificate file for HTTPS", ) # FastAPI Settings api_key: Optional[str] = Field( From 1e782f7ce7cd683d1e5a2d046778a2924b79960d Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 19:58:31 +0200 Subject: [PATCH 060/177] Lint --- examples/high_level_api/langchain_custom_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/high_level_api/langchain_custom_llm.py b/examples/high_level_api/langchain_custom_llm.py index c01f7c387..adbae6ce4 100644 --- a/examples/high_level_api/langchain_custom_llm.py +++ b/examples/high_level_api/langchain_custom_llm.py @@ -37,7 +37,7 @@ def _identifying_params(self) -> Mapping[str, Any]: # Basic Q&A answer = llm( - "Question: What is the capital of France? Answer: ", stop=["Question:", "\n"] + "Question: What is the capital of France? Answer: ", stop=["Question:", "\n"], ) print(f"Answer: {answer.strip()}") From 9d8ecd1854ae7f141863680bb957ad4fc9977981 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 19:59:44 +0200 Subject: [PATCH 061/177] Lint --- examples/high_level_api/high_level_api_infill.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/high_level_api/high_level_api_infill.py b/examples/high_level_api/high_level_api_infill.py index 282333e5a..c132ec632 100644 --- a/examples/high_level_api/high_level_api_infill.py +++ b/examples/high_level_api/high_level_api_infill.py @@ -33,5 +33,5 @@ filtered = True print( - f"Fill-in-Middle completion{' (filtered)' if filtered else ''}:\n\n{args.prompt}\033[32m{response}\033[{'33' if filtered else '0'}m{args.suffix}\033[0m" + f"Fill-in-Middle completion{' (filtered)' if filtered else ''}:\n\n{args.prompt}\033[32m{response}\033[{'33' if filtered else '0'}m{args.suffix}\033[0m", ) From ff353bc244a859289f9b5224f0e8212191993e33 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 20:04:09 +0200 Subject: [PATCH 062/177] Lint --- examples/low_level_api/common.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py index e108b848f..b924ad13f 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -75,7 +75,7 @@ class GptParams: def gpt_params_parse(argv=None): parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( "-s", @@ -102,7 +102,7 @@ def gpt_params_parse(argv=None): dest="n_predict", ) parser.add_argument( - "--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts" + "--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts", ) parser.add_argument( "-c", @@ -143,10 +143,10 @@ def 
gpt_params_parse(argv=None): dest="ignore_eos", ) parser.add_argument( - "--top_k", type=int, default=40, help="top-k sampling", dest="top_k" + "--top_k", type=int, default=40, help="top-k sampling", dest="top_k", ) parser.add_argument( - "--top_p", type=float, default=0.95, help="top-p samplin", dest="top_p" + "--top_p", type=float, default=0.95, help="top-p samplin", dest="top_p", ) parser.add_argument( "--tfs", @@ -156,7 +156,7 @@ def gpt_params_parse(argv=None): dest="tfs_z", ) parser.add_argument( - "--temp", type=float, default=0.80, help="temperature", dest="temp" + "--temp", type=float, default=0.80, help="temperature", dest="temp", ) parser.add_argument( "--repeat_penalty", @@ -217,7 +217,7 @@ def gpt_params_parse(argv=None): dest="model", ) parser.add_argument( - "-p", "--prompt", type=str, default=None, help="initial prompt", dest="prompt" + "-p", "--prompt", type=str, default=None, help="initial prompt", dest="prompt", ) parser.add_argument( "-f", @@ -242,7 +242,7 @@ def gpt_params_parse(argv=None): dest="input_prefix", ) parser.add_argument( - "--in-suffix", type=str, default="", help="append to input", dest="input_suffix" + "--in-suffix", type=str, default="", help="append to input", dest="input_suffix", ) parser.add_argument( "-r", From 1a40080d157f8124106602f1c4c289e4f1fa0ce0 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 20:05:22 +0200 Subject: [PATCH 063/177] Lint --- llama_cpp/llama_cache.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py index 000ce1581..8a9f18f37 100644 --- a/llama_cpp/llama_cache.py +++ b/llama_cpp/llama_cache.py @@ -41,7 +41,7 @@ def __contains__(self, key: Sequence[int]) -> bool: @abstractmethod def __setitem__( - self, key: Sequence[int], value: "llama_cpp.llama.LlamaState" + self, key: Sequence[int], value: "llama_cpp.llama.LlamaState", ) -> None: raise NotImplementedError @@ -105,7 +105,7 @@ class LlamaDiskCache(BaseLlamaCache): """Cache for a llama.cpp model using disk.""" def __init__( - self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30) + self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30), ): super().__init__(capacity_bytes) self.cache = diskcache.Cache(cache_dir) From f7e3b2f395fc43f42bf9fbeb777999f6ba895764 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 20:10:23 +0200 Subject: [PATCH 064/177] Lint --- llama_cpp/_internals.py | 54 ++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 08d004f5a..fcc8de826 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -46,7 +46,7 @@ def __init__( with suppress_stdout_stderr(disable=verbose): self.model = llama_cpp.llama_load_model_from_file( - self.path_model.encode("utf-8"), self.params + self.path_model.encode("utf-8"), self.params, ) if self.model is None: @@ -192,13 +192,13 @@ def tokenize(self, text: bytes, add_bos: bool, special: bool): n_ctx = self.n_ctx_train() tokens = (llama_cpp.llama_token * n_ctx)() n_tokens = llama_cpp.llama_tokenize( - self.model, text, len(text), tokens, n_ctx, add_bos, special + self.model, text, len(text), tokens, n_ctx, add_bos, special, ) if n_tokens < 0: n_tokens = abs(n_tokens) tokens = (llama_cpp.llama_token * n_tokens)() n_tokens = llama_cpp.llama_tokenize( - self.model, text, len(text), tokens, n_tokens, add_bos, special + self.model, text, len(text), tokens, n_tokens, 
add_bos, special, ) if n_tokens < 0: raise RuntimeError( @@ -219,7 +219,7 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes: buffer = (ctypes.c_char * size)() for token in tokens: n = llama_cpp.llama_token_to_piece( - self.model, llama_cpp.llama_token(token), buffer, size, 0, special + self.model, llama_cpp.llama_token(token), buffer, size, 0, special, ) assert n <= size output += bytes(buffer[:n]) @@ -242,23 +242,23 @@ def metadata(self) -> Dict[str, str]: # iterate over model keys for i in range(llama_cpp.llama_model_meta_count(self.model)): nbytes = llama_cpp.llama_model_meta_key_by_index( - self.model, i, buffer, buffer_size + self.model, i, buffer, buffer_size, ) if nbytes > buffer_size: buffer_size = nbytes + 1 buffer = ctypes.create_string_buffer(buffer_size) nbytes = llama_cpp.llama_model_meta_key_by_index( - self.model, i, buffer, buffer_size + self.model, i, buffer, buffer_size, ) key = buffer.value.decode("utf-8") nbytes = llama_cpp.llama_model_meta_val_str_by_index( - self.model, i, buffer, buffer_size + self.model, i, buffer, buffer_size, ) if nbytes > buffer_size: buffer_size = nbytes + 1 buffer = ctypes.create_string_buffer(buffer_size) nbytes = llama_cpp.llama_model_meta_val_str_by_index( - self.model, i, buffer, buffer_size + self.model, i, buffer, buffer_size, ) value = buffer.value.decode("utf-8") metadata[key] = value @@ -411,41 +411,41 @@ def sample_softmax(self, candidates: _LlamaTokenDataArray): def sample_top_k(self, candidates: _LlamaTokenDataArray, k: int, min_keep: int): assert self.ctx is not None llama_cpp.llama_sample_top_k( - self.ctx, llama_cpp.byref(candidates.candidates), k, min_keep + self.ctx, llama_cpp.byref(candidates.candidates), k, min_keep, ) def sample_top_p(self, candidates: _LlamaTokenDataArray, p: float, min_keep: int): assert self.ctx is not None llama_cpp.llama_sample_top_p( - self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep + self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep, ) def sample_min_p(self, candidates: _LlamaTokenDataArray, p: float, min_keep: int): assert self.ctx is not None llama_cpp.llama_sample_min_p( - self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep + self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep, ) def sample_tail_free( - self, candidates: _LlamaTokenDataArray, z: float, min_keep: int + self, candidates: _LlamaTokenDataArray, z: float, min_keep: int, ): assert self.ctx is not None llama_cpp.llama_sample_tail_free( - self.ctx, llama_cpp.byref(candidates.candidates), z, min_keep + self.ctx, llama_cpp.byref(candidates.candidates), z, min_keep, ) def sample_typical( - self, candidates: _LlamaTokenDataArray, p: float, min_keep: int + self, candidates: _LlamaTokenDataArray, p: float, min_keep: int, ): assert self.ctx is not None llama_cpp.llama_sample_typical( - self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep + self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep, ) def sample_temp(self, candidates: _LlamaTokenDataArray, temp: float): assert self.ctx is not None llama_cpp.llama_sample_temp( - self.ctx, llama_cpp.byref(candidates.candidates), temp + self.ctx, llama_cpp.byref(candidates.candidates), temp, ) def sample_grammar(self, candidates: _LlamaTokenDataArray, grammar: LlamaGrammar): @@ -528,7 +528,7 @@ def default_params(): class _LlamaBatch: def __init__( - self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True + self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True, ): self._n_tokens = 
n_tokens self.embd = embd @@ -538,7 +538,7 @@ def __init__( self.batch = None self.batch = llama_cpp.llama_batch_init( - self._n_tokens, self.embd, self.n_seq_max + self._n_tokens, self.embd, self.n_seq_max, ) def free_batch(): @@ -596,7 +596,7 @@ def __init__(self, *, n_vocab: int): self.candidates_data = np.recarray( (self.n_vocab,), dtype=np.dtype( - [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True + [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True, ), ) self.candidates = llama_cpp.llama_token_data_array( @@ -651,12 +651,12 @@ def _token_to_piece(model: _LlamaModel, token: int, special: bool = False) -> st assert model.model is not None result = (ctypes.c_char * 8)(0) n_tokens = llama_cpp.llama_token_to_piece( - model.model, token, result, 0, len(result), special + model.model, token, result, 0, len(result), special, ) if n_tokens < 0: result = (ctypes.c_char * -n_tokens)(0) check = llama_cpp.llama_token_to_piece( - model.model, token, result, 0, len(result), special + model.model, token, result, 0, len(result), special, ) if check != -n_tokens: raise RuntimeError(f"Failed to get piece: token={token}") @@ -789,7 +789,7 @@ def sample( logits_array[token] += logit_bias token_data_array = _LlamaTokenDataArray( - n_vocab=n_vocab + n_vocab=n_vocab, ) # TODO: Only create this once token_data_array.copy_logits(logits_array) @@ -841,19 +841,19 @@ def sample( else: min_keep = max(1, self.params.n_probs) ctx_main.sample_top_k( - token_data_array, self.params.top_k, min_keep=min_keep + token_data_array, self.params.top_k, min_keep=min_keep, ) ctx_main.sample_tail_free( - token_data_array, self.params.tfs_z, min_keep=min_keep + token_data_array, self.params.tfs_z, min_keep=min_keep, ) ctx_main.sample_typical( - token_data_array, self.params.typical_p, min_keep=min_keep + token_data_array, self.params.typical_p, min_keep=min_keep, ) ctx_main.sample_top_p( - token_data_array, self.params.top_p, min_keep=min_keep + token_data_array, self.params.top_p, min_keep=min_keep, ) ctx_main.sample_min_p( - token_data_array, self.params.min_p, min_keep=min_keep + token_data_array, self.params.min_p, min_keep=min_keep, ) ctx_main.sample_temp(token_data_array, self.params.temp) id = ctx_main.sample_token(token_data_array) From 954486610948fc55adfa03eff3f926893fe07c8f Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 20:16:14 +0200 Subject: [PATCH 065/177] Lint --- .../low_level_api/low_level_api_chat_cpp.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 5715f3f40..9cbf51dae 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -36,14 +36,14 @@ def __init__(self, params: GptParams) -> None: raise NotImplementedError( """************ please use the 'perplexity' tool for perplexity calculations -************""" +************""", ) if self.params.embedding: raise NotImplementedError( """************ please use the 'embedding' tool for embedding calculations -************""" +************""", ) if self.params.n_ctx > 2048: @@ -81,7 +81,7 @@ def __init__(self, params: GptParams) -> None: self.lparams.use_mmap = self.params.use_mmap self.model = llama_cpp.llama_load_model_from_file( - self.params.model.encode("utf8"), self.lparams + self.params.model.encode("utf8"), self.lparams, ) # Context Params. 
@@ -183,7 +183,7 @@ def __init__(self, params: GptParams) -> None: if len(self.embd_inp) > self.n_ctx - 4: raise RuntimeError( - f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})" + f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})", ) # debug message about similarity of saved session, if applicable @@ -201,15 +201,15 @@ def __init__(self, params: GptParams) -> None: print("session file has exact match for prompt!") elif self.n_matching_session_tokens < (len(self.embd_inp) / 2): print( - f"warning: session file has low similarity to prompt ({self.n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated" + f"warning: session file has low similarity to prompt ({self.n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated", ) else: print( - f"session file matches {self.n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt" + f"session file matches {self.n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt", ) self.need_to_save_session = len( - self.params.path_session + self.params.path_session, ) > 0 and self.n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) # number of tokens to keep when resetting context @@ -424,7 +424,7 @@ def generate(self): self.ctx, self.params.path_session.encode("utf8"), (llama_cpp.llama_token * len(self.session_tokens))( - *self.session_tokens + *self.session_tokens, ), len(self.session_tokens), ) @@ -442,7 +442,7 @@ def generate(self): *[ llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) for token_id in range(n_vocab) - ] + ], ) candidates_p = llama_cpp.ctypes.pointer( llama_cpp.llama_token_data_array(_arr, len(_arr), False) From bcf8d9b28bde7923012f8c2c3985ee6dce7b4cad Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 20:17:26 +0200 Subject: [PATCH 066/177] Lint --- llama_cpp/server/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index e06c45812..7b74a0ef5 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -67,11 +67,11 @@ def main(): import yaml config_file_settings = ConfigFileSettings.model_validate_json( - json.dumps(yaml.safe_load(f)) + json.dumps(yaml.safe_load(f)), ) else: config_file_settings = ConfigFileSettings.model_validate_json( - f.read() + f.read(), ) server_settings = ServerSettings.model_validate(config_file_settings) model_settings = config_file_settings.models From 92f5221d68dd55b2cdd1b4e861638b6f1a0ac624 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 20:21:00 +0200 Subject: [PATCH 067/177] Lint --- llama_cpp/server/model.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index afc3f91df..b78d6c4fe 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -26,7 +26,7 @@ def __init__(self, models: List[ModelSettings]) -> None: # Load default model self._current_model = self.load_llama_from_model_settings( - self._default_model_settings + self._default_model_settings, ) self._current_model_alias = self._default_model_alias @@ -82,7 +82,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: ) else: chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler( - clip_model_path=settings.clip_model_path, verbose=settings.verbose + clip_model_path=settings.clip_model_path, 
verbose=settings.verbose, ) elif settings.chat_format == "obsidian": assert settings.clip_model_path is not None, "clip model not found" @@ -96,7 +96,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: ) else: chat_handler = llama_cpp.llama_chat_format.ObsidianChatHandler( - clip_model_path=settings.clip_model_path, verbose=settings.verbose + clip_model_path=settings.clip_model_path, verbose=settings.verbose, ) elif settings.chat_format == "llava-1-6": assert settings.clip_model_path is not None, "clip model not found" @@ -124,7 +124,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: ) else: chat_handler = llama_cpp.llama_chat_format.MoondreamChatHandler( - clip_model_path=settings.clip_model_path, verbose=settings.verbose + clip_model_path=settings.clip_model_path, verbose=settings.verbose, ) elif settings.chat_format == "nanollava": assert settings.clip_model_path is not None, "clip model not found" @@ -138,7 +138,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: ) else: chat_handler = llama_cpp.llama_chat_format.NanoLlavaChatHandler( - clip_model_path=settings.clip_model_path, verbose=settings.verbose + clip_model_path=settings.clip_model_path, verbose=settings.verbose, ) elif settings.chat_format == "llama-3-vision-alpha": assert settings.clip_model_path is not None, "clip model not found" @@ -152,7 +152,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: ) else: chat_handler = llama_cpp.llama_chat_format.Llama3VisionAlpha( - clip_model_path=settings.clip_model_path, verbose=settings.verbose + clip_model_path=settings.clip_model_path, verbose=settings.verbose, ) elif settings.chat_format == "hf-autotokenizer": assert ( @@ -168,19 +168,19 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: settings.hf_tokenizer_config_path is not None ), "hf_tokenizer_config_path must be set for hf-tokenizer-config" chat_handler = llama_cpp.llama_chat_format.hf_tokenizer_config_to_chat_completion_handler( - json.load(open(settings.hf_tokenizer_config_path)) + json.load(open(settings.hf_tokenizer_config_path)), ) tokenizer: Optional[llama_cpp.BaseLlamaTokenizer] = None if settings.hf_pretrained_model_name_or_path is not None: tokenizer = llama_tokenizer.LlamaHFTokenizer.from_pretrained( - settings.hf_pretrained_model_name_or_path + settings.hf_pretrained_model_name_or_path, ) draft_model = None if settings.draft_model is not None: draft_model = llama_speculative.LlamaPromptLookupDecoding( - num_pred_tokens=settings.draft_model_num_pred_tokens + num_pred_tokens=settings.draft_model_num_pred_tokens, ) kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None From be20a80df267be29c15f1ca191b7494a3c5bdda9 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 20:22:07 +0200 Subject: [PATCH 068/177] Lint --- llama_cpp/_internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index fcc8de826..b58974b8b 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -202,7 +202,7 @@ def tokenize(self, text: bytes, add_bos: bool, special: bool): ) if n_tokens < 0: raise RuntimeError( - f'Failed to tokenize: text="{text}" n_tokens={n_tokens}' + f'Failed to tokenize: text="{text}" n_tokens={n_tokens}', ) return list(tokens[:n_tokens]) From ebfb3759f6df9a6ebc202bf2d9d648b56779e210 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 
2024 20:25:29 +0200 Subject: [PATCH 069/177] Lint --- examples/low_level_api/low_level_api_chat_cpp.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 9cbf51dae..3cadea7a6 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -445,7 +445,7 @@ def generate(self): ], ) candidates_p = llama_cpp.ctypes.pointer( - llama_cpp.llama_token_data_array(_arr, len(_arr), False) + llama_cpp.llama_token_data_array(_arr, len(_arr), False), ) # Apply penalties @@ -453,7 +453,7 @@ def generate(self): last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx) _arr = (llama_cpp.llama_token * last_n_repeat)( - *self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat :] + *self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat :], ) llama_cpp.llama_sample_repetition_penalties( ctx=self.ctx, @@ -480,7 +480,7 @@ def generate(self): mirostat_mu = 2.0 * self.params.mirostat_tau mirostat_m = 100 llama_cpp.llama_sample_temperature( - self.ctx, candidates_p, llama_cpp.c_float(self.params.temp) + self.ctx, candidates_p, llama_cpp.c_float(self.params.temp), ) id = llama_cpp.llama_sample_token_mirostat( self.ctx, @@ -493,7 +493,7 @@ def generate(self): elif self.params.mirostat == 2: mirostat_mu = 2.0 * self.params.mirostat_tau llama_cpp.llama_sample_temperature( - self.ctx, candidates_p, llama_cpp.c_float(self.params.temp) + self.ctx, candidates_p, llama_cpp.c_float(self.params.temp), ) id = llama_cpp.llama_sample_token_mirostat_v2( self.ctx, @@ -529,7 +529,7 @@ def generate(self): min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_temperature( - self.ctx, candidates_p, llama_cpp.c_float(self.params.temp) + self.ctx, candidates_p, llama_cpp.c_float(self.params.temp), ) id = llama_cpp.llama_sample_token(self.ctx, candidates_p) # print("`{}`".format(candidates_p.size)) @@ -600,7 +600,7 @@ def generate(self): # end of text token if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos( - self.ctx + self.ctx, ): if not self.params.instruct: for i in self.llama_token_eot: @@ -636,7 +636,7 @@ def token_to_str(self, token_id: int) -> bytes: size = 32 buffer = (ctypes.c_char * size)() n = llama_cpp.llama_token_to_piece( - self.model, llama_cpp.llama_token(token_id), buffer, size + self.model, llama_cpp.llama_token(token_id), buffer, size, ) assert n <= size return bytes(buffer[:n]) @@ -709,7 +709,7 @@ def interact(self): else: print(self.params.input_prefix, end="") self.input( - f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}" + f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}", ) print(self.params.input_suffix, end="") self.set_color(util.CONSOLE_COLOR_DEFAULT) From 77e88f1df52d0f1436f13b560cc4c6d1955dd540 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 20:44:46 +0200 Subject: [PATCH 070/177] Lint --- llama_cpp/llama_cpp.py | 94 +++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 0d30d6be9..bed102ad3 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -77,7 +77,7 @@ def _load_shared_library(lib_base_name: str): raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") raise FileNotFoundError( - f"Shared library with base name '{lib_base_name}' not found" + f"Shared library with 
base name '{lib_base_name}' not found", ) @@ -119,7 +119,7 @@ class CtypesRef(Generic[CtypesCData]): def ctypes_function_for_shared_library(lib: ctypes.CDLL): def ctypes_function( - name: str, argtypes: List[Any], restype: Any, enabled: bool = True + name: str, argtypes: List[Any], restype: Any, enabled: bool = True, ): def decorator(f: F) -> F: if enabled: @@ -576,7 +576,7 @@ class llama_token_data_array(ctypes.Structure): # typedef bool (*llama_progress_callback)(float progress, void * user_data); llama_progress_callback = ctypes.CFUNCTYPE( - ctypes.c_bool, ctypes.c_float, ctypes.c_void_p + ctypes.c_bool, ctypes.c_float, ctypes.c_void_p, ) @@ -930,7 +930,7 @@ class llama_context_params(ctypes.Structure): # // It might not exist for progress report where '.' is output repeatedly. # typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data); llama_log_callback = ctypes.CFUNCTYPE( - None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p + None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p, ) """Signature for logging events Note that text includes the new line character at the end for most events. @@ -1214,7 +1214,7 @@ def llama_backend_free(): llama_model_p_ctypes, ) def llama_load_model_from_file( - path_model: bytes, params: llama_model_params, / + path_model: bytes, params: llama_model_params, /, ) -> Optional[llama_model_p]: ... @@ -1238,7 +1238,7 @@ def llama_free_model(model: llama_model_p, /): llama_context_p_ctypes, ) def llama_new_context_with_model( - model: llama_model_p, params: llama_context_params, / + model: llama_model_p, params: llama_context_params, /, ) -> Optional[llama_context_p]: ... @@ -1488,10 +1488,10 @@ def llama_model_n_params(model: llama_model_p, /) -> int: # // Get a llama model tensor # LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name); @ctypes_function( - "llama_get_model_tensor", [llama_model_p_ctypes, ctypes.c_char_p], ctypes.c_void_p + "llama_get_model_tensor", [llama_model_p_ctypes, ctypes.c_char_p], ctypes.c_void_p, ) def llama_get_model_tensor( - model: llama_model_p, name: Union[ctypes.c_char_p, bytes], / + model: llama_model_p, name: Union[ctypes.c_char_p, bytes], /, ) -> ctypes.c_void_p: """Get a llama model tensor""" ... @@ -1509,7 +1509,7 @@ def llama_model_has_encoder(model: llama_model_p, /) -> bool: # // to the decoder to start generating output sequence. For other models, it returns -1. 
# LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model); @ctypes_function( - "llama_model_decoder_start_token", [llama_model_p_ctypes], ctypes.c_int32 + "llama_model_decoder_start_token", [llama_model_p_ctypes], ctypes.c_int32, ) def llama_model_decoder_start_token(model: llama_model_p, /) -> int: """For encoder-decoder models, this function returns id of the token that must be provided @@ -1553,7 +1553,7 @@ def llama_model_quantize( llama_lora_adapter_p_ctypes, ) def llama_lora_adapter_init( - model: llama_model_p, path_lora: bytes, / + model: llama_model_p, path_lora: bytes, /, ) -> Optional[llama_lora_adapter_p]: """Load a LoRA adapter from file The loaded adapter will be associated to the given model, and will be free when the model is deleted @@ -1573,7 +1573,7 @@ def llama_lora_adapter_init( ctypes.c_int32, ) def llama_lora_adapter_set( - ctx: llama_context_p, adapter: llama_lora_adapter_p, scale: float, / + ctx: llama_context_p, adapter: llama_lora_adapter_p, scale: float, /, ) -> int: """Add a loaded LoRA adapter to given context This will not modify model's weight""" @@ -1591,7 +1591,7 @@ def llama_lora_adapter_set( ctypes.c_int32, ) def llama_lora_adapter_remove( - ctx: llama_context_p, adapter: llama_lora_adapter_p, / + ctx: llama_context_p, adapter: llama_lora_adapter_p, /, ) -> int: """Remove a LoRA adapter from given context Return -1 if the adapter is not present in the context""" @@ -1758,7 +1758,7 @@ class llama_kv_cache_view(ctypes.Structure): llama_kv_cache_view, ) def llama_kv_cache_view_init( - ctx: llama_context_p, n_seq_max: Union[ctypes.c_int32, int], / + ctx: llama_context_p, n_seq_max: Union[ctypes.c_int32, int], /, ) -> llama_kv_cache_view: """Create an empty KV cache view. (use only for debugging purposes)""" ... @@ -1775,7 +1775,7 @@ def llama_kv_cache_view_free(view: ctypes.pointer[llama_kv_cache_view], /): # t # // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes) # LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view); @ctypes_function( - "llama_kv_cache_view_update", [llama_context_p_ctypes, llama_kv_cache_view_p], None + "llama_kv_cache_view_update", [llama_context_p_ctypes, llama_kv_cache_view_p], None, ) def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[llama_kv_cache_view], /): # type: ignore """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)""" @@ -1786,7 +1786,7 @@ def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[ll # // If a KV cell has multiple sequences assigned to it, it will be counted multiple times # LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx); @ctypes_function( - "llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32 + "llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32, ) def llama_get_kv_cache_token_count(ctx: llama_context_p, /) -> int: """Returns the number of tokens in the KV cache (slow, use only for debug) @@ -1798,7 +1798,7 @@ def llama_get_kv_cache_token_count(ctx: llama_context_p, /) -> int: # // Returns the number of used KV cells (i.e. 
have at least one sequence assigned to them) # LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx); @ctypes_function( - "llama_get_kv_cache_used_cells", [llama_context_p_ctypes], ctypes.c_int32 + "llama_get_kv_cache_used_cells", [llama_context_p_ctypes], ctypes.c_int32, ) def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int: """Returns the number of used KV cells (i.e. have at least one sequence assigned to them)""" @@ -1892,7 +1892,7 @@ def llama_kv_cache_seq_cp( # struct llama_context * ctx, # llama_seq_id seq_id); @ctypes_function( - "llama_kv_cache_seq_keep", [llama_context_p_ctypes, llama_seq_id], None + "llama_kv_cache_seq_keep", [llama_context_p_ctypes, llama_seq_id], None, ) def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): """Removes all tokens that do not belong to the specified sequence""" @@ -2062,7 +2062,7 @@ def llama_state_get_data( ctypes.c_size_t, ) def llama_copy_state_data( - ctx: llama_context_p, dst: CtypesArray[ctypes.c_uint8], / + ctx: llama_context_p, dst: CtypesArray[ctypes.c_uint8], /, ) -> int: """Copies the state to the specified destination address. Destination needs to have allocated enough memory. @@ -2102,7 +2102,7 @@ def llama_state_set_data( ctypes.c_size_t, ) def llama_set_state_data( - ctx: llama_context_p, src: CtypesArray[ctypes.c_uint8], / + ctx: llama_context_p, src: CtypesArray[ctypes.c_uint8], /, ) -> int: """Set the state reading from the specified address""" ... @@ -2397,7 +2397,7 @@ def llama_batch_get_one( # int32_t embd, # int32_t n_seq_max); @ctypes_function( - "llama_batch_init", [ctypes.c_int32, ctypes.c_int32, ctypes.c_int32], llama_batch + "llama_batch_init", [ctypes.c_int32, ctypes.c_int32, ctypes.c_int32], llama_batch, ) def llama_batch_init( n_tokens: Union[ctypes.c_int32, int], @@ -2578,7 +2578,7 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: ctypes.POINTER(ctypes.c_float), ) def llama_get_logits_ith( - ctx: llama_context_p, i: Union[ctypes.c_int32, int], / + ctx: llama_context_p, i: Union[ctypes.c_int32, int], /, ) -> CtypesArray[ctypes.c_float]: """Logits for the ith token. Equivalent to: llama_get_logits(ctx) + i*n_vocab""" @@ -2593,7 +2593,7 @@ def llama_get_logits_ith( # // Otherwise, returns NULL. 
# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); @ctypes_function( - "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float) + "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float), ) def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: """Get the embeddings for the input @@ -2613,7 +2613,7 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float] ctypes.POINTER(ctypes.c_float), ) def llama_get_embeddings_ith( - ctx: llama_context_p, i: Union[ctypes.c_int32, int], / + ctx: llama_context_p, i: Union[ctypes.c_int32, int], /, ) -> CtypesArray[ctypes.c_float]: """Get the embeddings for the ith sequence llama_get_embeddings(ctx) + i*n_embd""" @@ -2630,7 +2630,7 @@ def llama_get_embeddings_ith( ctypes.POINTER(ctypes.c_float), ) def llama_get_embeddings_seq( - ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / + ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /, ) -> CtypesArray[ctypes.c_float]: """Get the embeddings for a sequence id Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE @@ -2645,10 +2645,10 @@ def llama_get_embeddings_seq( # LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token); @ctypes_function( - "llama_token_get_text", [llama_model_p_ctypes, llama_token], ctypes.c_char_p + "llama_token_get_text", [llama_model_p_ctypes, llama_token], ctypes.c_char_p, ) def llama_token_get_text( - model: llama_model_p, token: Union[llama_token, int], / + model: llama_model_p, token: Union[llama_token, int], /, ) -> bytes: ... @@ -2658,17 +2658,17 @@ def llama_token_get_text( "llama_token_get_score", [llama_model_p_ctypes, llama_token], ctypes.c_float ) def llama_token_get_score( - model: llama_model_p, token: Union[llama_token, int], / + model: llama_model_p, token: Union[llama_token, int], /, ) -> float: ... # LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token); @ctypes_function( - "llama_token_get_attr", [llama_model_p_ctypes, llama_token], ctypes.c_int + "llama_token_get_attr", [llama_model_p_ctypes, llama_token], ctypes.c_int, ) def llama_token_get_attr( - model: llama_model_p, token: Union[llama_token, int], / + model: llama_model_p, token: Union[llama_token, int], /, ) -> int: ... @@ -2676,7 +2676,7 @@ def llama_token_get_attr( # // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) # LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token); @ctypes_function( - "llama_token_is_eog", [llama_model_p_ctypes, llama_token], ctypes.c_bool + "llama_token_is_eog", [llama_model_p_ctypes, llama_token], ctypes.c_bool, ) def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /) -> bool: """Check if the token is supposed to end generation (end-of-generation, eg. 
EOS, EOT, etc.)""" @@ -2686,10 +2686,10 @@ def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /) # // Identify if Token Id is a control token or a render-able token # LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token); @ctypes_function( - "llama_token_is_control", [llama_model_p_ctypes, llama_token], ctypes.c_bool + "llama_token_is_control", [llama_model_p_ctypes, llama_token], ctypes.c_bool, ) def llama_token_is_control( - model: llama_model_p, token: Union[llama_token, int], / + model: llama_model_p, token: Union[llama_token, int], /, ) -> bool: """Identify if Token Id is a control token or a render-able token""" ... @@ -3047,7 +3047,7 @@ def llama_grammar_sample( grammar: llama_grammar_p, ctx: llama_context_p, candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], ], /, ): @@ -3068,7 +3068,7 @@ def llama_grammar_sample( def llama_sample_grammar( ctx: llama_context_p, candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], ], grammar, # type: llama_grammar_p /, @@ -3145,7 +3145,7 @@ def llama_set_rng_seed(ctx: llama_context_p, seed: Union[ctypes.c_uint32, int], def llama_sample_repetition_penalties( ctx: llama_context_p, candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], ], last_tokens_data: CtypesArray[llama_token], penalty_last_n: Union[ctypes.c_size_t, int], @@ -3202,7 +3202,7 @@ def llama_sample_apply_guidance( def llama_sample_softmax( ctx: llama_context_p, candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], ], /, ): @@ -3224,7 +3224,7 @@ def llama_sample_softmax( def llama_sample_top_k( ctx: llama_context_p, candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], ], k: Union[ctypes.c_int, int], min_keep: Union[ctypes.c_size_t, int], @@ -3248,7 +3248,7 @@ def llama_sample_top_k( def llama_sample_top_p( ctx: llama_context_p, candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], ], p: Union[ctypes.c_float, float], min_keep: Union[ctypes.c_size_t, int], @@ -3272,7 +3272,7 @@ def llama_sample_top_p( def llama_sample_min_p( ctx: llama_context_p, candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], ], p: Union[ctypes.c_float, float], min_keep: Union[ctypes.c_size_t, int], @@ -3296,7 +3296,7 @@ def llama_sample_min_p( def llama_sample_tail_free( ctx: llama_context_p, candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], ], z: Union[ctypes.c_float, float], min_keep: Union[ctypes.c_size_t, int], @@ -3320,7 +3320,7 @@ def llama_sample_tail_free( def llama_sample_typical( ctx: llama_context_p, candidates: 
Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], ], p: Union[ctypes.c_float, float], min_keep: Union[ctypes.c_size_t, int], @@ -3351,7 +3351,7 @@ def llama_sample_typical( def llama_sample_entropy( ctx: llama_context_p, candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], ], min_temp: Union[ctypes.c_float, float], max_temp: Union[ctypes.c_float, float], @@ -3374,7 +3374,7 @@ def llama_sample_entropy( def llama_sample_temp( ctx: llama_context_p, candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], ], temp: Union[ctypes.c_float, float], /, @@ -3416,7 +3416,7 @@ def llama_sample_temp( def llama_sample_token_mirostat( ctx: llama_context_p, candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], ], tau: Union[ctypes.c_float, float], eta: Union[ctypes.c_float, float], @@ -3461,7 +3461,7 @@ def llama_sample_token_mirostat( def llama_sample_token_mirostat_v2( ctx: llama_context_p, candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], ], tau: Union[ctypes.c_float, float], eta: Union[ctypes.c_float, float], @@ -3492,7 +3492,7 @@ def llama_sample_token_mirostat_v2( def llama_sample_token_greedy( ctx: llama_context_p, candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], ], /, ) -> int: @@ -3512,7 +3512,7 @@ def llama_sample_token_greedy( def llama_sample_token( ctx: llama_context_p, candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], ], /, ) -> int: From 907e65f5c5ee3ee79569a681415917a5186d0e3b Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 20:45:37 +0200 Subject: [PATCH 071/177] Lint --- examples/low_level_api/ReasonAct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api/ReasonAct.py b/examples/low_level_api/ReasonAct.py index 20bd33bfa..ebfcceae9 100644 --- a/examples/low_level_api/ReasonAct.py +++ b/examples/low_level_api/ReasonAct.py @@ -32,7 +32,7 @@ def env_or_def(env, default): Thought: Do I need to use an action? No, I know the answer Answer: Paris is the capital of France Question:""" + " ".join( - sys.argv[1:] + sys.argv[1:], ) print("Loading model...") From 79b563d66843855b11fb2b6a9e83a79821f79270 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 20:46:22 +0200 Subject: [PATCH 072/177] Update Chat.py --- examples/low_level_api/Chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api/Chat.py b/examples/low_level_api/Chat.py index fcbc79c69..4f338f8d0 100644 --- a/examples/low_level_api/Chat.py +++ b/examples/low_level_api/Chat.py @@ -51,7 +51,7 @@ def env_or_def(env, default): {USER_NAME}: What time is it? {AI_NAME}: It is {DATE_TIME}. 
{USER_NAME}:""" + " ".join( - sys.argv[1:] + sys.argv[1:], ) print("Loading model...") From be3c02d14b38161cf7bd95094b6a7b8932e51d6d Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 20:48:15 +0200 Subject: [PATCH 073/177] Lint --- llama_cpp/llama_types.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 909a8f779..84bde6ef3 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -131,7 +131,7 @@ class ChatCompletionStreamResponseDelta(TypedDict): class ChatCompletionStreamResponseChoice(TypedDict): index: int delta: Union[ - ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty + ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty, ] finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]] logprobs: NotRequired[Optional[CompletionLogprobs]] @@ -248,7 +248,7 @@ class ChatCompletionRequestFunctionCallOption(TypedDict): ChatCompletionRequestFunctionCall = Union[ - Literal["none", "auto"], ChatCompletionRequestFunctionCallOption + Literal["none", "auto"], ChatCompletionRequestFunctionCallOption, ] ChatCompletionFunctionParameters = Dict[str, JsonType] # TODO: make this more specific @@ -275,7 +275,7 @@ class ChatCompletionNamedToolChoice(TypedDict): ChatCompletionToolChoiceOption = Union[ - Literal["none", "auto", "required"], ChatCompletionNamedToolChoice + Literal["none", "auto", "required"], ChatCompletionNamedToolChoice, ] From 054b29bf23afb82075b4e0138789b8c6c3038828 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 20:48:55 +0200 Subject: [PATCH 074/177] Lint --- examples/low_level_api/Miku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api/Miku.py b/examples/low_level_api/Miku.py index 8d99b0223..abc1f66c4 100644 --- a/examples/low_level_api/Miku.py +++ b/examples/low_level_api/Miku.py @@ -37,7 +37,7 @@ def env_or_def(env, default): {AI_NAME}: /think I wonder what {USER_NAME} likes to do in his free time? I should ask him about that! {AI_NAME}: What do you like to do in your free time? ^_^ {USER_NAME}:""" + " ".join( - sys.argv[1:] + sys.argv[1:], ) print("Loading model...") From d2ca3d8ddab2007e644fdc74d948551d3db57975 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 21:54:28 +0200 Subject: [PATCH 075/177] Lint --- llama_cpp/llama.py | 68 +++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index e3d093e3d..68b7456da 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -231,12 +231,12 @@ def __init__( if self.tensor_split is not None: if len(self.tensor_split) > llama_cpp.LLAMA_MAX_DEVICES: raise ValueError( - f"Attempt to split tensors that exceed maximum supported devices. Current LLAMA_MAX_DEVICES={llama_cpp.LLAMA_MAX_DEVICES}" + f"Attempt to split tensors that exceed maximum supported devices. 
Current LLAMA_MAX_DEVICES={llama_cpp.LLAMA_MAX_DEVICES}", ) # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES self._c_tensor_split = FloatArray( - *tensor_split # type: ignore + *tensor_split, # type: ignore ) # keep a reference to the array so it is not gc'd self.model_params.tensor_split = self._c_tensor_split self.model_params.vocab_only = vocab_only @@ -366,8 +366,8 @@ def __init__( path_model=self.model_path, params=self.model_params, verbose=self.verbose, - ) - ) + ), + ), ) # Override tokenizer @@ -386,8 +386,8 @@ def __init__( model=self._model, params=self.context_params, verbose=self.verbose, - ) - ) + ), + ), ) self._batch = self._stack.enter_context( @@ -397,8 +397,8 @@ def __init__( embd=0, n_seq_max=self.context_params.n_ctx, verbose=self.verbose, - ) - ) + ), + ), ) self._lora_adapter: Optional[llama_cpp.llama_lora_adapter_p] = None @@ -411,14 +411,14 @@ def __init__( ) if self._lora_adapter is None: raise RuntimeError( - f"Failed to initialize LoRA adapter from lora path: {self.lora_path}" + f"Failed to initialize LoRA adapter from lora path: {self.lora_path}", ) assert self._ctx.ctx is not None if llama_cpp.llama_lora_adapter_set( - self._ctx.ctx, self._lora_adapter, self.lora_scale + self._ctx.ctx, self._lora_adapter, self.lora_scale, ): raise RuntimeError( - f"Failed to set LoRA adapter from lora path: {self.lora_path}" + f"Failed to set LoRA adapter from lora path: {self.lora_path}", ) if self.verbose: @@ -443,11 +443,11 @@ def __init__( self.n_tokens = 0 self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) self.scores: npt.NDArray[np.single] = np.ndarray( - (n_ctx, self._n_vocab), dtype=np.single + (n_ctx, self._n_vocab), dtype=np.single, ) self._mirostat_mu = ctypes.c_float( - 2.0 * 5.0 + 2.0 * 5.0, ) # TODO: Move this to sampling context try: @@ -502,7 +502,7 @@ def __init__( and "chat_template.default" in template_choices ): chat_format = llama_chat_format.guess_chat_format_from_gguf_metadata( - self.metadata + self.metadata, ) if chat_format is not None: @@ -524,7 +524,7 @@ def __init__( self.chat_format = "llama-2" if self.verbose: print( - f"Using fallback chat format: {self.chat_format}", file=sys.stderr + f"Using fallback chat format: {self.chat_format}", file=sys.stderr, ) @property @@ -557,7 +557,7 @@ def eval_logits(self) -> Deque[List[float]]: ) def tokenize( - self, text: bytes, add_bos: bool = True, special: bool = False + self, text: bytes, add_bos: bool = True, special: bool = False, ) -> List[int]: """Tokenize a string. @@ -573,7 +573,7 @@ def tokenize( return self.tokenizer_.tokenize(text, add_bos, special) def detokenize( - self, tokens: List[int], prev_tokens: Optional[List[int]] = None + self, tokens: List[int], prev_tokens: Optional[List[int]] = None, ) -> bytes: """Detokenize a list of tokens. 
@@ -621,7 +621,7 @@ def eval(self, tokens: Sequence[int]): n_past = self.n_tokens n_tokens = len(batch) self._batch.set_batch( - batch=batch, n_past=n_past, logits_all=self.context_params.logits_all + batch=batch, n_past=n_past, logits_all=self.context_params.logits_all, ) self._ctx.decode(self._batch) # Save tokens @@ -631,14 +631,14 @@ def eval(self, tokens: Sequence[int]): rows = n_tokens cols = self._n_vocab logits = np.ctypeslib.as_array( - self._ctx.get_logits(), shape=(rows * cols,) + self._ctx.get_logits(), shape=(rows * cols,), ) self.scores[n_past : n_past + n_tokens, :].reshape(-1)[::] = logits else: rows = 1 cols = self._n_vocab logits = np.ctypeslib.as_array( - self._ctx.get_logits(), shape=(rows * cols,) + self._ctx.get_logits(), shape=(rows * cols,), ) self.scores[n_past + n_tokens - 1, :].reshape(-1)[::] = logits # Update n_tokens @@ -812,7 +812,7 @@ def generate( sample_idx += 1 if stopping_criteria is not None and stopping_criteria( - self._input_ids, self._scores[-1, :] + self._input_ids, self._scores[-1, :], ): return tokens_or_none = yield token @@ -829,16 +829,16 @@ def generate( if self.draft_model is not None: self.input_ids[self.n_tokens : self.n_tokens + len(tokens)] = tokens draft_tokens = self.draft_model( - self.input_ids[: self.n_tokens + len(tokens)] + self.input_ids[: self.n_tokens + len(tokens)], ) tokens.extend( draft_tokens.astype(int)[ : self._n_ctx - self.n_tokens - len(tokens) - ] + ], ) def create_embedding( - self, input: Union[str, List[str]], model: Optional[str] = None + self, input: Union[str, List[str]], model: Optional[str] = None, ) -> CreateEmbeddingResponse: """Embed a string. @@ -903,7 +903,7 @@ def embed( if self.context_params.embeddings is False: raise RuntimeError( - "Llama model must be created with embedding=True to call this method" + "Llama model must be created with embedding=True to call this method", ) if self.verbose: @@ -965,7 +965,7 @@ def decode_batch(seq_sizes: List[int]): # check for overrun if n_tokens > n_batch: raise ValueError( - f"Requested tokens ({n_tokens}) exceed batch size of {n_batch}" + f"Requested tokens ({n_tokens}) exceed batch size of {n_batch}", ) # time to eval batch @@ -1027,7 +1027,7 @@ def _create_completion( grammar: Optional[LlamaGrammar] = None, logit_bias: Optional[Dict[str, float]] = None, ) -> Union[ - Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] + Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse], ]: assert self._ctx is not None assert suffix is None or suffix.__class__ is str @@ -1045,7 +1045,7 @@ def _create_completion( ) bos_tokens: List[int] = [cls_token_id if cls_token_id != -1 else bos_token_id] eos_tokens: List[int] = [ - sep_token_id if sep_token_id != -1 else self.token_eos() + sep_token_id if sep_token_id != -1 else self.token_eos(), ] if ( @@ -1134,7 +1134,7 @@ def logit_bias_processor( scores: npt.NDArray[np.single], ) -> npt.NDArray[np.single]: new_scores = np.copy( - scores + scores, ) # Does it make sense to copy the whole array or can we just overwrite the original one? 
for input_id, score in logit_bias_map.items(): new_scores[input_id] = score + scores[input_id] @@ -1151,7 +1151,7 @@ def logit_bias_processor( if len(prompt_tokens) >= self._n_ctx: raise ValueError( - f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" + f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}", ) if max_tokens is None or max_tokens <= 0: @@ -1172,17 +1172,17 @@ def logit_bias_processor( if logprobs is not None and self.context_params.logits_all is False: raise ValueError( - "logprobs is not supported for models created with logits_all=False" + "logprobs is not supported for models created with logits_all=False", ) if self.cache: try: cache_item = self.cache[prompt_tokens] cache_prefix_len = Llama.longest_token_prefix( - cache_item.input_ids.tolist(), prompt_tokens + cache_item.input_ids.tolist(), prompt_tokens, ) eval_prefix_len = Llama.longest_token_prefix( - self._input_ids.tolist(), prompt_tokens + self._input_ids.tolist(), prompt_tokens, ) if cache_prefix_len > eval_prefix_len: self.load_state(cache_item) @@ -1277,7 +1277,7 @@ def logit_bias_processor( [token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], - ) + ), ) # Check if stop sequence is in the token if token_end_position > ( From b5fc1da5851fb7fddd0d3fe821284c20b381eaa8 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 21:59:05 +0200 Subject: [PATCH 076/177] Lint --- llama_cpp/llama_tokenizer.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/llama_cpp/llama_tokenizer.py b/llama_cpp/llama_tokenizer.py index 6f235b658..ee952566b 100644 --- a/llama_cpp/llama_tokenizer.py +++ b/llama_cpp/llama_tokenizer.py @@ -14,7 +14,7 @@ class BaseLlamaTokenizer(abc.ABC): @abc.abstractmethod def tokenize( - self, text: bytes, add_bos: bool = True, special: bool = True + self, text: bytes, add_bos: bool = True, special: bool = True, ) -> List[int]: """Tokenize the text into tokens. @@ -26,7 +26,7 @@ def tokenize( @abc.abstractmethod def detokenize( - self, tokens: List[int], prev_tokens: Optional[List[int]] = None + self, tokens: List[int], prev_tokens: Optional[List[int]] = None, ) -> bytes: """Detokenize the tokens into text. 
@@ -42,20 +42,20 @@ def __init__(self, llama: llama_cpp.Llama): self._model = llama._model # type: ignore def tokenize( - self, text: bytes, add_bos: bool = True, special: bool = True + self, text: bytes, add_bos: bool = True, special: bool = True, ) -> List[int]: return self._model.tokenize(text, add_bos=add_bos, special=special) def detokenize( - self, tokens: List[int], prev_tokens: Optional[List[int]] = None + self, tokens: List[int], prev_tokens: Optional[List[int]] = None, ) -> bytes: return self._model.detokenize(tokens) def encode( - self, text: str, add_bos: bool = True, special: bool = True + self, text: str, add_bos: bool = True, special: bool = True, ) -> List[int]: return self.tokenize( - text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special + text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special, ) def decode(self, tokens: List[int]) -> str: @@ -71,21 +71,21 @@ def __init__(self, hf_tokenizer: Any): self.hf_tokenizer = hf_tokenizer def tokenize( - self, text: bytes, add_bos: bool = True, special: bool = True + self, text: bytes, add_bos: bool = True, special: bool = True, ) -> List[int]: return self.hf_tokenizer.encode( - text.decode("utf-8", errors="ignore"), add_special_tokens=special + text.decode("utf-8", errors="ignore"), add_special_tokens=special, ) def detokenize( - self, tokens: List[int], prev_tokens: Optional[List[int]] = None + self, tokens: List[int], prev_tokens: Optional[List[int]] = None, ) -> bytes: if prev_tokens is not None: text = self.hf_tokenizer.decode(prev_tokens + tokens).encode( - "utf-8", errors="ignore" + "utf-8", errors="ignore", ) prev_text = self.hf_tokenizer.decode(prev_tokens).encode( - "utf-8", errors="ignore" + "utf-8", errors="ignore", ) return text[len(prev_text) :] else: @@ -98,9 +98,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: str) -> LlamaHFTokenizer except ImportError: raise ImportError( "The `transformers` library is required to use the `HFTokenizer`." - "You can install it with `pip install transformers`." 
+ "You can install it with `pip install transformers`.", ) hf_tokenizer = AutoTokenizer.from_pretrained( - pretrained_model_name_or_path=pretrained_model_name_or_path + pretrained_model_name_or_path=pretrained_model_name_or_path, ) return cls(hf_tokenizer) From c355e7c3233ad444055d71d9a4ae1c382e6636e0 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 22:00:36 +0200 Subject: [PATCH 077/177] Lint --- examples/low_level_api/low_level_api_llama_cpp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py index f3e8eb569..fc944f70b 100644 --- a/examples/low_level_api/low_level_api_llama_cpp.py +++ b/examples/low_level_api/low_level_api_llama_cpp.py @@ -19,7 +19,7 @@ # determine the required inference memory per token: tmp = [0, 1, 2, 3] llama_cpp.llama_eval( - ctx=ctx, tokens=(llama_cpp.c_int * len(tmp))(*tmp), n_tokens=len(tmp), n_past=0 + ctx=ctx, tokens=(llama_cpp.c_int * len(tmp))(*tmp), n_tokens=len(tmp), n_past=0, ) # Deprecated n_past = 0 @@ -76,10 +76,10 @@ *[ llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) for token_id in range(n_vocab) - ] + ], ) candidates_p = llama_cpp.ctypes.pointer( - llama_cpp.llama_token_data_array(_arr, len(_arr), False) + llama_cpp.llama_token_data_array(_arr, len(_arr), False), ) _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data) From 65262eba2ea22d427352c7bf46a25de10691fd31 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 22:03:39 +0200 Subject: [PATCH 078/177] Lint --- examples/low_level_api/low_level_api_llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py index fc944f70b..7bfcb8d89 100644 --- a/examples/low_level_api/low_level_api_llama_cpp.py +++ b/examples/low_level_api/low_level_api_llama_cpp.py @@ -114,7 +114,7 @@ size = 32 buffer = (ctypes.c_char * size)() n = llama_cpp.llama_token_to_piece( - model, llama_cpp.llama_token(id), buffer, size + model, llama_cpp.llama_token(id), buffer, size, ) assert n <= size print( From bb88cb3d5d775937c2abf1cc33df203ecceb9824 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 22:04:32 +0200 Subject: [PATCH 079/177] Lint --- examples/gradio_chat/local.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/gradio_chat/local.py b/examples/gradio_chat/local.py index 617e340b2..65bbfb79e 100644 --- a/examples/gradio_chat/local.py +++ b/examples/gradio_chat/local.py @@ -7,7 +7,7 @@ repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF", filename="*q8_0.gguf", tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( - "Qwen/Qwen1.5-0.5B" + "Qwen/Qwen1.5-0.5B", ), verbose=False, ) @@ -25,7 +25,7 @@ def predict(message, history): messages.append({"role": "user", "content": message}) response = llama.create_chat_completion_openai_v1( - model=model, messages=messages, stream=True + model=model, messages=messages, stream=True, ) text = "" From 28e11f639cfcec071fae8631393634e235b63b69 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 22:05:09 +0200 Subject: [PATCH 080/177] Lint --- examples/gradio_chat/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gradio_chat/server.py b/examples/gradio_chat/server.py index 34f4f7548..516c2dad2 100644 --- a/examples/gradio_chat/server.py +++ b/examples/gradio_chat/server.py @@ 
-16,7 +16,7 @@ def predict(message, history): messages.append({"role": "user", "content": message}) response = client.chat.completions.create( - model=model, messages=messages, stream=True + model=model, messages=messages, stream=True, ) text = "" From 749474bbb52b1e282ed753a45edf98a1b95748c7 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 22:08:16 +0200 Subject: [PATCH 081/177] Lint --- llama_cpp/llava_cpp.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py index dbf8ad19c..33e92b2b0 100644 --- a/llama_cpp/llava_cpp.py +++ b/llama_cpp/llava_cpp.py @@ -82,7 +82,7 @@ def _load_shared_library(lib_base_name: str): raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") raise FileNotFoundError( - f"Shared library with base name '{lib_base_name}' not found" + f"Shared library with base name '{lib_base_name}' not found", ) @@ -107,7 +107,7 @@ class CtypesRef(Generic[CtypesCData]): pass CtypesPointerOrRef: TypeAlias = Union[ - CtypesPointer[CtypesCData], CtypesRef[CtypesCData] + CtypesPointer[CtypesCData], CtypesRef[CtypesCData], ] CtypesFuncPointer: TypeAlias = ctypes._FuncPointer # type: ignore @@ -117,7 +117,7 @@ class CtypesRef(Generic[CtypesCData]): def ctypes_function_for_shared_library(lib: ctypes.CDLL): def ctypes_function( - name: str, argtypes: List[Any], restype: Any, enabled: bool = True + name: str, argtypes: List[Any], restype: Any, enabled: bool = True, ): def decorator(f: F) -> F: if enabled: @@ -165,7 +165,7 @@ class llava_image_embed(Structure): c_bool, ) def llava_validate_embed_size( - ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, / + ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, /, ) -> bool: ... @@ -193,7 +193,7 @@ def llava_image_embed_make_with_bytes( POINTER(llava_image_embed), ) def llava_image_embed_make_with_filename( - ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, / + ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, /, ) -> _Pointer[llava_image_embed]: ... @@ -233,7 +233,7 @@ def llava_eval_image_embed( # CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); @ctypes_function("clip_model_load", [c_char_p, c_int], clip_ctx_p_ctypes) def clip_model_load( - fname: bytes, verbosity: Union[c_int, int], / + fname: bytes, verbosity: Union[c_int, int], /, ) -> Optional[clip_ctx_p]: ... 
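Note on the lint commits above (through [PATCH 081/177]): these are mechanical ruff auto-fixes, almost entirely the addition of a trailing comma after the last element of a multi-line call, signature, or literal. A minimal before/after sketch of that shape is shown below (hypothetical code, not taken from this repository; attributing the change to ruff's flake8-commas/COM812 fixer is an assumption, and the function name is invented for illustration):

    # before the auto-fix: no trailing comma after the final argument
    result = make_request(  # make_request is a made-up name used only for this sketch
        prompt=prompt,
        stop=stops
    )

    # after running e.g. `ruff check --fix-only .`: a trailing comma is appended,
    # so adding another argument later touches only a single line in the diff
    result = make_request(
        prompt=prompt,
        stop=stops,
    )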
From 7e9bde636ddb40934334858cd681ca03b85bae12 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 22:09:19 +0200 Subject: [PATCH 082/177] Lint --- llama_cpp/server/errors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/errors.py b/llama_cpp/server/errors.py index dae9960ba..03926e292 100644 --- a/llama_cpp/server/errors.py +++ b/llama_cpp/server/errors.py @@ -114,10 +114,10 @@ class RouteErrorHandler(APIRoute): ], ] = { compile( - r"Requested tokens \((\d+)\) exceed context window of (\d+)" + r"Requested tokens \((\d+)\) exceed context window of (\d+)", ): ErrorResponseFormatters.context_length_exceeded, compile( - r"Model path does not exist: (.+)" + r"Model path does not exist: (.+)", ): ErrorResponseFormatters.model_not_found, } From 4056760721317701075c9d152b4b523a1b9a2858 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 22:11:48 +0200 Subject: [PATCH 083/177] Lint --- llama_cpp/llama_cpp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index bed102ad3..21799bcc9 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -109,7 +109,7 @@ class CtypesRef(Generic[CtypesCData]): pass CtypesPointerOrRef: TypeAlias = Union[ - CtypesPointer[CtypesCData], CtypesRef[CtypesCData] + CtypesPointer[CtypesCData], CtypesRef[CtypesCData], ] CtypesFuncPointer: TypeAlias = ctypes._FuncPointer # type: ignore @@ -214,7 +214,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa # from ggml-backend.h # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); ggml_backend_sched_eval_callback = ctypes.CFUNCTYPE( - ctypes.c_bool, ctypes.c_void_p, ctypes.c_bool, ctypes.c_void_p + ctypes.c_bool, ctypes.c_void_p, ctypes.c_bool, ctypes.c_void_p, ) # // Abort callback @@ -2553,7 +2553,7 @@ def llama_synchronize(ctx: llama_context_p, /): # // Cols: n_vocab # LLAMA_API float * llama_get_logits(struct llama_context * ctx); @ctypes_function( - "llama_get_logits", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float) + "llama_get_logits", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float), ) def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: """Token logits obtained from the last call to llama_eval() @@ -2655,7 +2655,7 @@ def llama_token_get_text( # LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token); @ctypes_function( - "llama_token_get_score", [llama_model_p_ctypes, llama_token], ctypes.c_float + "llama_token_get_score", [llama_model_p_ctypes, llama_token], ctypes.c_float, ) def llama_token_get_score( model: llama_model_p, token: Union[llama_token, int], /, From c6c4f8caad6394c99a52ecc4c6c29d86fba6a634 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 22:23:30 +0200 Subject: [PATCH 084/177] Lint --- llama_cpp/llama.py | 62 +++++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 68b7456da..f8e749dc1 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1294,7 +1294,7 @@ def logit_bias_processor( completion_tokens[:returned_tokens], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], - ).decode("utf-8", errors="ignore") + ).decode("utf-8", errors="ignore"), ) token_offset = len(prompt_tokens) + returned_tokens logits = self._scores[token_offset - 1, :] @@ -1305,7 
+1305,7 @@ def logit_bias_processor( ) top_logprob = { self.detokenize([i]).decode( - "utf-8", errors="ignore" + "utf-8", errors="ignore", ): logprob for logprob, i in sorted_logprobs[:logprobs] } @@ -1316,7 +1316,7 @@ def logit_bias_processor( [token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], - ).decode("utf-8", errors="ignore") + ).decode("utf-8", errors="ignore"), ], "text_offset": [text_offset], "token_logprobs": [current_logprobs[int(token)]], @@ -1338,7 +1338,7 @@ def logit_bias_processor( "index": 0, "logprobs": logprobs_or_none, "finish_reason": None, - } + }, ], } else: @@ -1390,7 +1390,7 @@ def logit_bias_processor( break if stopping_criteria is not None and stopping_criteria( - self._input_ids, self._scores[-1, :] + self._input_ids, self._scores[-1, :], ): text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) finish_reason = "stop" @@ -1416,7 +1416,7 @@ def logit_bias_processor( self.detokenize( [token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], - ) + ), ) logprobs_or_none: Optional[CompletionLogprobs] = None @@ -1424,14 +1424,14 @@ def logit_bias_processor( if token == bos_token_id: continue token_str = self.detokenize([token]).decode( - "utf-8", errors="ignore" + "utf-8", errors="ignore", ) text_offset = len(prompt) + len( self.detokenize( completion_tokens[:returned_tokens], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], - ) + ), ) token_offset = len(prompt_tokens) + returned_tokens - 1 logits = self._scores[token_offset, :] @@ -1447,7 +1447,7 @@ def logit_bias_processor( top_logprob.update({token_str: current_logprobs[int(token)]}) logprobs_or_none = { "tokens": [ - self.detokenize([token]).decode("utf-8", errors="ignore") + self.detokenize([token]).decode("utf-8", errors="ignore"), ], "text_offset": [text_offset], "token_logprobs": [current_logprobs[int(token)]], @@ -1472,7 +1472,7 @@ def logit_bias_processor( "index": 0, "logprobs": logprobs_or_none, "finish_reason": None, - } + }, ], } break @@ -1485,12 +1485,12 @@ def logit_bias_processor( "choices": [ { "text": self.detokenize([token]).decode( - "utf-8", errors="ignore" + "utf-8", errors="ignore", ), "index": 0, "logprobs": logprobs_or_none, "finish_reason": None, - } + }, ], } yield { @@ -1504,7 +1504,7 @@ def logit_bias_processor( "index": 0, "logprobs": None, "finish_reason": finish_reason, - } + }, ], } if self.cache: @@ -1547,14 +1547,14 @@ def logit_bias_processor( all_token_strs = [ self.detokenize([token], prev_tokens=all_tokens[:i]).decode( - "utf-8", errors="ignore" + "utf-8", errors="ignore", ) for i, token in enumerate(all_tokens) ] all_logprobs = Llama.logits_to_logprobs(self._scores)[token_offset:] # TODO: may be able to change this loop to use np.take_along_dim for idx, (token, token_str, logprobs_token) in enumerate( - zip(all_tokens, all_token_strs, all_logprobs) + zip(all_tokens, all_token_strs, all_logprobs), ): if token == bos_token_id: continue @@ -1562,18 +1562,18 @@ def logit_bias_processor( text_offset + len( self.detokenize(all_tokens[:idx]).decode( - "utf-8", errors="ignore" - ) - ) + "utf-8", errors="ignore", + ), + ), ) tokens.append(token_str) sorted_logprobs = sorted( - zip(logprobs_token, range(len(logprobs_token))), reverse=True + zip(logprobs_token, range(len(logprobs_token))), reverse=True, ) token_logprobs.append(logprobs_token[int(token)]) top_logprob: Optional[Dict[str, float]] = { self.detokenize([i], prev_tokens=all_tokens[:idx]).decode( - "utf-8", errors="ignore" + "utf-8", errors="ignore", ): logprob 
for logprob, i in sorted_logprobs[:logprobs] } @@ -1603,7 +1603,7 @@ def logit_bias_processor( "index": 0, "logprobs": logprobs_or_none, "finish_reason": finish_reason, - } + }, ], "usage": { "prompt_tokens": len(prompt_tokens), @@ -1832,7 +1832,7 @@ def create_chat_completion( logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, ) -> Union[ - CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] + CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse], ]: """Generate a chat completion from a list of messages. @@ -1934,7 +1934,7 @@ def create_chat_completion_openai_v1( except ImportError: raise ImportError( "To use create_chat_completion_openai_v1, you must install the openai package." - "You can install it with `pip install openai`." + "You can install it with `pip install openai`.", ) def __getstate__(self): @@ -2082,7 +2082,7 @@ def __del__(self) -> None: @staticmethod def logits_to_logprobs( - logits: Union[npt.NDArray[np.single], List], axis: int = -1 + logits: Union[npt.NDArray[np.single], List], axis: int = -1, ) -> npt.NDArray[np.single]: # https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.log_softmax.html logits_maxs: np.ndarray = np.amax(logits, axis=axis, keepdims=True) @@ -2137,7 +2137,7 @@ def from_pretrained( except ImportError: raise ImportError( "Llama.from_pretrained requires the huggingface-hub package. " - "You can install it with `pip install huggingface-hub`." + "You can install it with `pip install huggingface-hub`.", ) validate_repo_id(repo_id) @@ -2160,13 +2160,13 @@ def from_pretrained( if len(matching_files) == 0: raise ValueError( f"No file found in {repo_id} that match {filename}\n\n" - f"Available Files:\n{json.dumps(file_list)}" + f"Available Files:\n{json.dumps(file_list)}", ) if len(matching_files) > 1: raise ValueError( f"Multiple files found in {repo_id} matching {filename}\n\n" - f"Available Files:\n{json.dumps(files)}" + f"Available Files:\n{json.dumps(files)}", ) (matching_file,) = matching_files @@ -2220,13 +2220,13 @@ def __init__( LogitsProcessor = Callable[ - [npt.NDArray[np.intc], npt.NDArray[np.single]], npt.NDArray[np.single] + [npt.NDArray[np.intc], npt.NDArray[np.single]], npt.NDArray[np.single], ] class LogitsProcessorList(List[LogitsProcessor]): def __call__( - self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single] + self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single], ) -> npt.NDArray[np.single]: for processor in self: scores = processor(input_ids, scores) @@ -2238,7 +2238,7 @@ def __call__( class StoppingCriteriaList(List[StoppingCriteria]): def __call__( - self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single] + self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single], ) -> bool: return any([stopping_criteria(input_ids, logits) for stopping_criteria in self]) @@ -2250,7 +2250,7 @@ def __init__(self, min_tokens: int, token_eos: int): self.prompt_tokens = None def __call__( - self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single] + self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single], ) -> npt.NDArray[np.single]: if self.prompt_tokens is None: self.prompt_tokens = len(input_ids) From 142d2c6605b1bdb37ad8bc150e33287ed22a46cb Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 23:07:06 +0200 Subject: [PATCH 085/177] Lint --- llama_cpp/llama_chat_format.py | 276 ++++++++++++++++----------------- 1 file changed, 138 insertions(+), 138 deletions(-) diff --git
a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 194fec07f..09def4c13 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -118,7 +118,7 @@ def register_chat_completion_handler( ): if not overwrite and name in self._chat_handlers: raise ValueError( - f"Formatter with name '{name}' is already registered. Use `overwrite=True` to overwrite it." + f"Formatter with name '{name}' is already registered. Use `overwrite=True` to overwrite it.", ) self._chat_handlers[name] = chat_handler @@ -129,20 +129,20 @@ def unregister_chat_handler(self, name: str): raise ValueError(f"No formatter registered under the name '{name}'.") def get_chat_completion_handler_by_name( - self, name: str + self, name: str, ) -> LlamaChatCompletionHandler: try: chat_handler = self._chat_handlers[name] return chat_handler except KeyError: raise LlamaChatCompletionHandlerNotFoundException( - f"Invalid chat handler: {name} (valid formats: {list(self._chat_handlers.keys())})" + f"Invalid chat handler: {name} (valid formats: {list(self._chat_handlers.keys())})", ) def get_chat_completion_handler(name: str) -> LlamaChatCompletionHandler: return LlamaChatCompletionHandlerRegistry().get_chat_completion_handler_by_name( - name + name, ) @@ -238,7 +238,7 @@ def raise_exception(message: str): if self.stop_token_ids is not None: def stop_on_last_token( - tokens: npt.NDArray[np.intc], logits: npt.NDArray[np.single] + tokens: npt.NDArray[np.intc], logits: npt.NDArray[np.single], ) -> bool: return tokens[-1] in self.stop_token_ids @@ -273,7 +273,7 @@ def _convert_text_completion_to_chat( }, "logprobs": completion["choices"][0]["logprobs"], "finish_reason": completion["choices"][0]["finish_reason"], - } + }, ], "usage": completion["usage"], } @@ -297,7 +297,7 @@ def _convert_text_completion_chunks_to_chat( }, "logprobs": None, "finish_reason": None, - } + }, ], } yield { @@ -317,7 +317,7 @@ def _convert_text_completion_chunks_to_chat( ), "logprobs": chunk["choices"][0]["logprobs"], "finish_reason": chunk["choices"][0]["finish_reason"], - } + }, ], } @@ -329,7 +329,7 @@ def _convert_completion_to_chat( ], stream: bool = False, ) -> Union[ - llama_types.CreateChatCompletionResponse, Iterator[llama_types.ChatCompletionChunk] + llama_types.CreateChatCompletionResponse, Iterator[llama_types.ChatCompletionChunk], ]: if stream: chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore @@ -375,12 +375,12 @@ def _convert_completion_to_chat_function( "name": tool_name, "arguments": completion["choices"][0]["text"], }, - } + }, ], }, "logprobs": completion["choices"][0]["logprobs"], "finish_reason": "tool_calls", - } + }, ], "usage": completion["usage"], } @@ -419,7 +419,7 @@ def _stream_response_to_function_stream( "function_call": None, "tool_calls": None, }, - } + }, ], } yield { @@ -450,10 +450,10 @@ def _stream_response_to_function_stream( "text" ], }, - } + }, ], }, - } + }, ], } first = False @@ -485,10 +485,10 @@ def _stream_response_to_function_stream( "name": tool_name, "arguments": chunk["choices"][0]["text"], }, - } + }, ], }, - } + }, ], } @@ -509,7 +509,7 @@ def _stream_response_to_function_stream( "function_call": None, "tool_calls": None, }, - } + }, ], } @@ -580,7 +580,7 @@ def chat_completion_handler( if response_format is not None and response_format["type"] == "json_object": grammar = _grammar_for_response_format( - response_format, verbose=llama.verbose + response_format, verbose=llama.verbose, ) # Convert legacy functions to tools @@ 
-621,13 +621,13 @@ def chat_completion_handler( try: # create grammar from json schema grammar = llama_grammar.LlamaGrammar.from_json_schema( - json.dumps(schema), verbose=llama.verbose + json.dumps(schema), verbose=llama.verbose, ) except Exception as e: if llama.verbose: print(str(e), file=sys.stderr) grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF, verbose=llama.verbose + llama_grammar.JSON_GBNF, verbose=llama.verbose, ) completion_or_chunks = llama.create_completion( @@ -658,7 +658,7 @@ def chat_completion_handler( if tool is not None: tool_name = tool["function"]["name"] return _convert_completion_to_chat_function( - tool_name, completion_or_chunks, stream + tool_name, completion_or_chunks, stream, ) return _convert_completion_to_chat(completion_or_chunks, stream=stream) @@ -666,7 +666,7 @@ def chat_completion_handler( def hf_autotokenizer_to_chat_formatter( - pretrained_model_name_or_path: Union[str, os.PathLike[str]] + pretrained_model_name_or_path: Union[str, os.PathLike[str]], ) -> ChatFormatter: # https://huggingface.co/docs/transformers/main/chat_templating # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format @@ -684,14 +684,14 @@ def format_autotokenizer( assert isinstance(prompt, str) # Return formatted prompt and eos token by default return ChatFormatterResponse( - prompt=prompt, stop=tokenizer.eos_token, added_special=True + prompt=prompt, stop=tokenizer.eos_token, added_special=True, ) return format_autotokenizer def hf_autotokenizer_to_chat_completion_handler( - pretrained_model_name_or_path: Union[str, os.PathLike[str]] + pretrained_model_name_or_path: Union[str, os.PathLike[str]], ) -> LlamaChatCompletionHandler: chat_formatter = hf_autotokenizer_to_chat_formatter(pretrained_model_name_or_path) return chat_formatter_to_chat_completion_handler(chat_formatter) @@ -730,7 +730,7 @@ def format_tokenizer_config( messages = [ *messages, llama_types.ChatCompletionRequestAssistantMessage( - role="assistant", content="" + role="assistant", content="", ), ] prompt = env.render( @@ -739,7 +739,7 @@ def format_tokenizer_config( eos_token=eos_token, ) return ChatFormatterResponse( - prompt=prompt, stop=[eos_token, bos_token], added_special=True + prompt=prompt, stop=[eos_token, bos_token], added_special=True, ) return format_tokenizer_config @@ -750,7 +750,7 @@ def hf_tokenizer_config_to_chat_completion_handler( add_generation_prompt: bool = True, ) -> LlamaChatCompletionHandler: chat_formatter = hf_tokenizer_config_to_chat_formatter( - tokenizer_config, add_generation_prompt=add_generation_prompt + tokenizer_config, add_generation_prompt=add_generation_prompt, ) return chat_formatter_to_chat_completion_handler(chat_formatter) @@ -805,7 +805,7 @@ def _map_roles( def _format_llama2( - system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str, sep2: str + system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str, sep2: str, ) -> str: """Format the prompt with the llama2 style.""" seps = [sep, sep2] @@ -822,7 +822,7 @@ def _format_llama2( def _format_add_colon_single( - system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str + system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str, ) -> str: """Format the prompt with the add-colon-single style.""" ret = system_message + sep @@ -835,7 +835,7 @@ def _format_add_colon_single( def _format_add_colon_two( - system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str, sep2: str + system_message: str, messages: 
List[Tuple[str, Optional[str]]], sep: str, sep2: str, ) -> str: """Format the prompt with the add-colon-two style.""" seps = [sep, sep2] @@ -849,7 +849,7 @@ def _format_add_colon_two( def _format_no_colon_single( - system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str + system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str, ) -> str: """Format the prompt with the no-colon-single style.""" ret = system_message @@ -862,7 +862,7 @@ def _format_no_colon_single( def _format_add_colon_space_single( - system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str + system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str, ) -> str: """Format the prompt with the add-colon-space-single style.""" ret = system_message + sep @@ -875,7 +875,7 @@ def _format_add_colon_space_single( def _format_chatml( - system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str + system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str, ) -> str: """Format the prompt with the chatml style.""" ret = "" if system_message == "" else system_message + sep + "\n" @@ -888,7 +888,7 @@ def _format_chatml( def _format_chatglm3( - system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str + system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str, ) -> str: """Format the prompt with the chatglm3 style.""" ret = "" @@ -904,12 +904,12 @@ def _format_chatglm3( def _grammar_for_json(verbose: bool = False): return llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF, verbose=verbose + llama_grammar.JSON_GBNF, verbose=verbose, ) def _grammar_for_json_schema( - schema: str, verbose: bool = False, fallback_to_json: bool = True + schema: str, verbose: bool = False, fallback_to_json: bool = True, ): try: return llama_grammar.LlamaGrammar.from_json_schema(schema, verbose=verbose) @@ -929,7 +929,7 @@ def _grammar_for_response_format( if "schema" in response_format: return _grammar_for_json_schema( - json.dumps(response_format["schema"]), verbose=verbose + json.dumps(response_format["schema"]), verbose=verbose, ) else: return _grammar_for_json(verbose=verbose) @@ -942,7 +942,7 @@ def register_chat_format(name: str): def decorator(f: ChatFormatter): chat_completion_handler = chat_formatter_to_chat_completion_handler(f) LlamaChatCompletionHandlerRegistry().register_chat_completion_handler( - name, chat_completion_handler + name, chat_completion_handler, ) return f @@ -1308,7 +1308,7 @@ def format_openchat( system_message = _get_system_message(messages) system_message = system_template.format(system_message=system_message) _roles = dict( - user="GPT4 Correct User: ", assistant="<|end_of_turn|>GPT4 Correct Assistant: " + user="GPT4 Correct User: ", assistant="<|end_of_turn|>GPT4 Correct Assistant: ", ) _sep = "<|end_of_turn|>" _messages = _map_roles(messages, _roles) @@ -1349,7 +1349,7 @@ def format_gemma( system_message = _get_system_message(messages) if system_message != "": logger.debug( - "`role='system'` messages are not allowed on Google's Gemma models." + "`role='system'` messages are not allowed on Google's Gemma models.", ) _roles = dict(user="user\n", assistant="model\n") _sep = "\n" @@ -1394,7 +1394,7 @@ def functionary_chat_handler( SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" def generate_type_definition( - param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs + param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs, ) -> str: indent = " " * indent_level if "$ref" in param: @@ -1412,7 +1412,7 @@ def generate_type_definition( nested_schema = "{\n" for nested_param_name, nested_param in properties.items(): nested_param_type = generate_type_definition( - nested_param, indent_level + 1, shared_defs + nested_param, indent_level + 1, shared_defs, ) nested_schema += ( f"{indent} {nested_param_name}: {nested_param_type},\n" @@ -1433,12 +1433,12 @@ def generate_shared_definitions(shared_defs, indent_level: int) -> str: shared_definitions += f"{indent}type {def_name} = " if def_properties.get("type") == "object": shared_definitions += generate_type_definition( - def_properties, indent_level, shared_defs + def_properties, indent_level, shared_defs, ) elif "enum" in def_properties: # Enum type shared_definitions += " | ".join( - [f'"{enum_value}"' for enum_value in def_properties["enum"]] + [f'"{enum_value}"' for enum_value in def_properties["enum"]], ) shared_definitions += ";\n" return shared_definitions @@ -1486,8 +1486,8 @@ def prepare_messages_for_inference( if functions is not None: all_messages.append( llama_types.ChatCompletionRequestSystemMessage( - role="system", content=generate_schema_from_functions(functions) - ) + role="system", content=generate_schema_from_functions(functions), + ), ) if tools is not None: @@ -1499,15 +1499,15 @@ def prepare_messages_for_inference( tool["function"] for tool in tools if tool["type"] == "function" - ] + ], ), - ) + ), ) all_messages.append( llama_types.ChatCompletionRequestSystemMessage( - role="system", content=SYSTEM_MESSAGE - ) + role="system", content=SYSTEM_MESSAGE, + ), ) for message in messages: @@ -1523,8 +1523,8 @@ def prepare_messages_for_inference( all_messages.append( llama_types.ChatCompletionRequestAssistantMessage( - role="assistant", content=None - ) + role="assistant", content=None, + ), ) def message_to_str(msg: llama_types.ChatCompletionRequestMessage): @@ -1603,7 +1603,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): ): stop = "\n" completion: llama_types.Completion = llama.create_completion( - prompt=prompt, stop=stop, stream=False + prompt=prompt, stop=stop, stream=False, ) # type: ignore completion_text = completion["choices"][0]["text"] # strip " to=functions." 
and ending ":" @@ -1631,7 +1631,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): try: with suppress_stdout_stderr(disable=llama.verbose): grammar_text = llama_grammar.json_schema_to_gbnf( - json.dumps(function_body) + json.dumps(function_body), ) grammar = llama_grammar.LlamaGrammar.from_string( llama_grammar.json_schema_to_gbnf(json.dumps(function_body)), @@ -1641,7 +1641,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): except Exception as e: if llama.verbose: print( - "Failed to parse function body as JSON schema, falling back to default grammar" + "Failed to parse function body as JSON schema, falling back to default grammar", ) print(e) with suppress_stdout_stderr(disable=llama.verbose): @@ -1652,7 +1652,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): else: with suppress_stdout_stderr(disable=llama.verbose): grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF, verbose=llama.verbose + llama_grammar.JSON_GBNF, verbose=llama.verbose, ) completion: llama_types.Completion = llama.create_completion( @@ -1709,12 +1709,12 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): "name": function_call, "arguments": completion["choices"][0]["text"], }, - } + }, ], }, "logprobs": completion["choices"][0]["logprobs"], "finish_reason": "tool_calls", - } + }, ], usage=completion["usage"], ) @@ -1754,7 +1754,7 @@ def functionary_v1_v2_chat_handler( tokenizer = llama.tokenizer_ assert hasattr( - tokenizer, "hf_tokenizer" + tokenizer, "hf_tokenizer", ), "Please provide a valid hf_tokenizer_path from https://huggingface.co/meetkai when initializing the Llama class" from transformers import AutoTokenizer @@ -1774,7 +1774,7 @@ def functionary_v1_v2_chat_handler( CONTENT_TOKEN = "<|content|>" def generate_type_definition( - param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs + param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs, ) -> str: indent = " " * indent_level if "$ref" in param: @@ -1813,12 +1813,12 @@ def generate_shared_definitions(shared_defs, indent_level: int) -> str: shared_definitions += f"{indent}type {def_name} = " if def_properties.get("type") == "object": shared_definitions += generate_type_definition( - def_properties, indent_level, shared_defs + def_properties, indent_level, shared_defs, ) elif "enum" in def_properties: # Enum type shared_definitions += " | ".join( - [f'"{enum_value}"' for enum_value in def_properties["enum"]] + [f'"{enum_value}"' for enum_value in def_properties["enum"]], ) shared_definitions += ";\n" return shared_definitions @@ -1869,14 +1869,14 @@ def prepare_messages_for_inference( if tool_choice == "none": all_messages.append( llama_types.ChatCompletionRequestSystemMessage( - role="system", content=generate_schema_from_functions([]) - ) + role="system", content=generate_schema_from_functions([]), + ), ) elif functions is not None: all_messages.append( llama_types.ChatCompletionRequestSystemMessage( - role="system", content=generate_schema_from_functions(functions) - ) + role="system", content=generate_schema_from_functions(functions), + ), ) elif tools is not None and tool_choice != "none": all_messages.append( @@ -1887,15 +1887,15 @@ def prepare_messages_for_inference( tool["function"] for tool in tools if tool["type"] == "function" - ] + ], ), - ) + ), ) all_messages.append( llama_types.ChatCompletionRequestSystemMessage( - role="system", content=SYSTEM_MESSAGE - ) + role="system", content=SYSTEM_MESSAGE, + ), ) for 
message in messages: @@ -1932,7 +1932,7 @@ def prepare_messages_for_inference( function_call = "auto" prompt = prepare_messages_for_inference( - messages, tokenizer, version, functions, tools, function_call + messages, tokenizer, version, functions, tools, function_call, ) # If no tools/functions are provided @@ -1984,21 +1984,21 @@ def get_grammar(function_call): try: with suppress_stdout_stderr(disable=llama.verbose): grammar_text = llama_grammar.json_schema_to_gbnf( - json.dumps(function_body) + json.dumps(function_body), ) grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.json_schema_to_gbnf(json.dumps(function_body)) + llama_grammar.json_schema_to_gbnf(json.dumps(function_body)), ) print(grammar_text) except Exception as e: if llama.verbose: print( - "Failed to parse function body as JSON schema, falling back to default grammar" + "Failed to parse function body as JSON schema, falling back to default grammar", ) print(e) with suppress_stdout_stderr(disable=llama.verbose): grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF, verbose=llama.verbose + llama_grammar.JSON_GBNF, verbose=llama.verbose, ) return grammar @@ -2046,7 +2046,7 @@ def generate_streaming(tools, functions, function_call, prompt): grammar = get_grammar(function_call["name"]) stops = [STOP_TOKEN, FROM_TOKEN] tool_id = "".join( - [random.choice(string.ascii_letters + string.digits) for _ in range(24)] + [random.choice(string.ascii_letters + string.digits) for _ in range(24)], ) completion = create_completion(prompt=prompt, stop=stops, grammar=grammar) completion_text = "" @@ -2065,15 +2065,15 @@ def generate_streaming(tools, functions, function_call, prompt): "name": function_call["name"], "arguments": "", }, - } - ] + }, + ], } else: func_call_dict = { "function_call": { "name": function_call["name"], "arguments": "", - } + }, } yield llama_types.CreateChatCompletionStreamResponse( id="chat" + chunk["id"], @@ -2089,7 +2089,7 @@ def generate_streaming(tools, functions, function_call, prompt): "content": None, **func_call_dict, }, - } + }, ], ) first = False @@ -2104,15 +2104,15 @@ def generate_streaming(tools, functions, function_call, prompt): "name": None, "arguments": chunk["choices"][0]["text"].rstrip(), }, - } - ] + }, + ], } else: func_call_dict = { "function_call": { "name": None, "arguments": chunk["choices"][0]["text"].rstrip(), - } + }, } if len(chunk["choices"][0]["text"].rstrip()) > 0: yield llama_types.CreateChatCompletionStreamResponse( @@ -2151,7 +2151,7 @@ def generate_streaming(tools, functions, function_call, prompt): "function_call": None, "tool_calls": None, }, - } + }, ], ) # If "auto" or no tool_choice/function_call @@ -2162,7 +2162,7 @@ def generate_streaming(tools, functions, function_call, prompt): grammar = None stops = CONTENT_TOKEN completion = create_completion( - prompt=prompt, stop=stops, grammar=grammar + prompt=prompt, stop=stops, grammar=grammar, ) completion_text = "" for chunk in completion: @@ -2186,7 +2186,7 @@ def generate_streaming(tools, functions, function_call, prompt): "delta": {"role": "assistant", "content": ""}, "logprobs": None, "finish_reason": None, - } + }, ], ) else: @@ -2196,7 +2196,7 @@ def generate_streaming(tools, functions, function_call, prompt): [ random.choice(string.ascii_letters + string.digits) for _ in range(24) - ] + ], ) if tools is not None: func_call_dict = { @@ -2231,13 +2231,13 @@ def generate_streaming(tools, functions, function_call, prompt): "content": None, **func_call_dict, }, - } + }, ], ) # Generate 
content stops = [RECIPIENT_TOKEN, STOP_TOKEN] completion = create_completion( - prompt=prompt, stop=stops, grammar=grammar + prompt=prompt, stop=stops, grammar=grammar, ) if function_name == "all": completion_text = "" @@ -2270,7 +2270,7 @@ def generate_streaming(tools, functions, function_call, prompt): "role": "assistant", "content": buffer.pop(0), }, - } + }, ], ) is_end = False @@ -2299,7 +2299,7 @@ def generate_streaming(tools, functions, function_call, prompt): ].lstrip() ), }, - } + }, ], ) # Check whether the model wants to generate another turn @@ -2331,7 +2331,7 @@ def generate_streaming(tools, functions, function_call, prompt): "delta": {}, "logprobs": None, "finish_reason": "stop", - } + }, ], ) break @@ -2354,8 +2354,8 @@ def generate_streaming(tools, functions, function_call, prompt): "text" ].rstrip(), }, - } - ] + }, + ], } else: func_call_dict = { @@ -2364,7 +2364,7 @@ def generate_streaming(tools, functions, function_call, prompt): "arguments": chunk["choices"][0][ "text" ].rstrip(), - } + }, } yield llama_types.CreateChatCompletionStreamResponse( id="chat" + chunk_id, @@ -2380,16 +2380,16 @@ def generate_streaming(tools, functions, function_call, prompt): "content": None, **func_call_dict, }, - } + }, ], ) prompt += completion_text.strip() grammar = None completion = create_completion( - prompt=prompt, stop=stops, grammar=grammar + prompt=prompt, stop=stops, grammar=grammar, ) completion_text += "".join( - [chunk["choices"][0]["text"] for chunk in completion] + [chunk["choices"][0]["text"] for chunk in completion], ) if ( "<|from|> assistant" in completion_text @@ -2419,14 +2419,14 @@ def generate_streaming(tools, functions, function_call, prompt): "function_call": None, "tool_calls": None, }, - } + }, ], ) break if stream is not False: return generate_streaming( - tools=tools, functions=functions, function_call=function_call, prompt=prompt + tools=tools, functions=functions, function_call=function_call, prompt=prompt, ) else: if version == "v1": @@ -2462,16 +2462,16 @@ def generate_streaming(tools, functions, function_call, prompt): ): prompt += ( completion_text.replace( - f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN + f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN, ) + "\n" ) function_calls.append( - completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip() + completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip(), ) grammar = get_grammar(function_calls[-1]) completion = create_completion( - prompt=prompt, stop=END_FUNCTION_CALL_TOKEN, grammar=grammar + prompt=prompt, stop=END_FUNCTION_CALL_TOKEN, grammar=grammar, ) completion_tokens += completion["usage"]["completion_tokens"] function_bodies.append(completion["choices"][0]["text"].strip()) @@ -2485,7 +2485,7 @@ def generate_streaming(tools, functions, function_call, prompt): grammar = get_grammar(function_call) stops = [STOP_TOKEN, FROM_TOKEN] completion = create_completion( - prompt=prompt, stop=stops, grammar=grammar + prompt=prompt, stop=stops, grammar=grammar, ) completion_text = completion["choices"][0]["text"] completion_tokens += completion["usage"]["completion_tokens"] @@ -2497,7 +2497,7 @@ def generate_streaming(tools, functions, function_call, prompt): grammar = None stops = CONTENT_TOKEN completion = create_completion( - prompt=prompt, stop=stops, grammar=grammar + prompt=prompt, stop=stops, grammar=grammar, ) completion_text = completion["choices"][0]["text"] completion_tokens += completion["usage"]["completion_tokens"] @@ -2512,7 +2512,7 @@ def 
generate_streaming(tools, functions, function_call, prompt): # Generate content stops = [RECIPIENT_TOKEN, STOP_TOKEN] completion = create_completion( - prompt=prompt, stop=stops, grammar=grammar + prompt=prompt, stop=stops, grammar=grammar, ) completion_text = completion["choices"][0]["text"] completion_tokens += completion["usage"]["completion_tokens"] @@ -2548,7 +2548,7 @@ def generate_streaming(tools, functions, function_call, prompt): prompt += completion_text.strip() grammar = None completion = create_completion( - prompt=prompt, stop=stops, grammar=grammar + prompt=prompt, stop=stops, grammar=grammar, ) completion_tokens += completion["usage"]["completion_tokens"] if ( @@ -2571,14 +2571,14 @@ def generate_streaming(tools, functions, function_call, prompt): [ random.choice(string.ascii_letters + string.digits) for _ in range(24) - ] + ], ), "type": "function", "function": { "name": function_call, "arguments": function_body, }, - } + }, ) # TODO: support stream mode @@ -2613,7 +2613,7 @@ def generate_streaming(tools, functions, function_call, prompt): **function_call_dict, }, "finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop", - } + }, ], usage=completion["usage"], ) @@ -2745,8 +2745,8 @@ def __call__( if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None: messages = [ llama_types.ChatCompletionRequestSystemMessage( - role="system", content=self.DEFAULT_SYSTEM_MESSAGE - ) + role="system", content=self.DEFAULT_SYSTEM_MESSAGE, + ), ] + messages image_urls = self.get_image_urls(messages) @@ -2779,7 +2779,7 @@ def embed_image_bytes(image_bytes: bytes): self.clip_ctx, llama.context_params.n_threads_batch, (ctypes.c_uint8 * len(image_bytes)).from_buffer( - bytearray(image_bytes) + bytearray(image_bytes), ), len(image_bytes), ) @@ -2793,7 +2793,7 @@ def embed_image_bytes(image_bytes: bytes): for type_, value in split_text: if type_ == "text": tokens = llama.tokenize( - value.encode("utf8"), add_bos=False, special=True + value.encode("utf8"), add_bos=False, special=True, ) if llama.n_tokens + len(tokens) > llama.n_ctx(): raise ValueError( @@ -2805,7 +2805,7 @@ def embed_image_bytes(image_bytes: bytes): embed = embed_image_bytes(image_bytes) if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx(): raise ValueError( - f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}" + f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}", ) n_past = ctypes.c_int(llama.n_tokens) n_past_p = ctypes.pointer(n_past) @@ -2864,13 +2864,13 @@ def embed_image_bytes(image_bytes: bytes): try: # create grammar from json schema grammar = llama_grammar.LlamaGrammar.from_json_schema( - json.dumps(schema), verbose=llama.verbose + json.dumps(schema), verbose=llama.verbose, ) except Exception as e: if llama.verbose: print(str(e), file=sys.stderr) grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF, verbose=llama.verbose + llama_grammar.JSON_GBNF, verbose=llama.verbose, ) completion_or_chunks = llama.create_completion( @@ -2900,7 +2900,7 @@ def embed_image_bytes(image_bytes: bytes): if tool is not None: tool_name = tool["function"]["name"] return _convert_completion_to_chat_function( - tool_name, completion_or_chunks, stream + tool_name, completion_or_chunks, stream, ) return _convert_completion_to_chat(completion_or_chunks, stream=stream) @@ -2981,7 +2981,7 @@ def from_pretrained( except ImportError: raise ImportError( "Llama.from_pretrained requires the huggingface-hub package. 
" - "You can install it with `pip install huggingface-hub`." + "You can install it with `pip install huggingface-hub`.", ) validate_repo_id(repo_id) @@ -3004,13 +3004,13 @@ def from_pretrained( if len(matching_files) == 0: raise ValueError( f"No file found in {repo_id} that match {filename}\n\n" - f"Available Files:\n{json.dumps(file_list)}" + f"Available Files:\n{json.dumps(file_list)}", ) if len(matching_files) > 1: raise ValueError( f"Multiple files found in {repo_id} matching {filename}\n\n" - f"Available Files:\n{json.dumps(files)}" + f"Available Files:\n{json.dumps(files)}", ) (matching_file,) = matching_files @@ -3468,7 +3468,7 @@ def chatml_function_calling( if isinstance(tool_choice, dict): tool_name = tool_choice["function"]["name"] tool = next( - (tool for tool in tools if tool["function"]["name"] == tool_name), None + (tool for tool in tools if tool["function"]["name"] == tool_name), None, ) if tool is None: raise ValueError(f"Tool with name '{tool_name}' not found in tools") @@ -3481,15 +3481,15 @@ def chatml_function_calling( prompt += f"functions.{tool_name}:\n" try: grammar = llama_grammar.LlamaGrammar.from_json_schema( - json.dumps(tool["function"]["parameters"]), verbose=llama.verbose + json.dumps(tool["function"]["parameters"]), verbose=llama.verbose, ) except Exception as e: grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF, verbose=llama.verbose + llama_grammar.JSON_GBNF, verbose=llama.verbose, ) if llama.verbose: print( - "Failed to parse function body as JSON schema, falling back to default grammar" + "Failed to parse function body as JSON schema, falling back to default grammar", ) print(e) completion_or_chunks = llama.create_completion( @@ -3514,13 +3514,13 @@ def chatml_function_calling( grammar=grammar, ) return _convert_completion_to_chat_function( - tool_name, completion_or_chunks, stream + tool_name, completion_or_chunks, stream, ) # Case 3: Automatic tool choice assert isinstance(tool_choice, str) and tool_choice == "auto" function_names = " | ".join( - [f'''"functions.{tool['function']['name']}:"''' for tool in tools] + [f'''"functions.{tool['function']['name']}:"''' for tool in tools], ) initial_gbnf_tool_grammar = ( """root ::= functions | "message:"\n""" @@ -3556,7 +3556,7 @@ def chatml_function_calling( model=model, logits_processor=logits_processor, grammar=llama_grammar.LlamaGrammar.from_string( - initial_gbnf_tool_grammar, verbose=llama.verbose + initial_gbnf_tool_grammar, verbose=llama.verbose, ), ) completion: llama_types.CreateCompletionResponse = completion_or_chunks # type: ignore @@ -3584,7 +3584,7 @@ def chatml_function_calling( model=model, logits_processor=logits_processor, grammar=llama_grammar.LlamaGrammar.from_string( - follow_up_gbnf_tool_grammar, verbose=llama.verbose + follow_up_gbnf_tool_grammar, verbose=llama.verbose, ), ), stream=stream, @@ -3600,15 +3600,15 @@ def chatml_function_calling( prompt += f"functions.{tool_name}:\n" try: grammar = llama_grammar.LlamaGrammar.from_json_schema( - json.dumps(tool["function"]["parameters"]), verbose=llama.verbose + json.dumps(tool["function"]["parameters"]), verbose=llama.verbose, ) except Exception as e: grammar = llama_grammar.LlamaGrammar.from_string( - llama_grammar.JSON_GBNF, verbose=llama.verbose + llama_grammar.JSON_GBNF, verbose=llama.verbose, ) if llama.verbose: print( - "Failed to parse function body as JSON schema, falling back to default grammar" + "Failed to parse function body as JSON schema, falling back to default grammar", ) print(e) 
completion_or_chunks = llama.create_completion( @@ -3633,7 +3633,7 @@ def chatml_function_calling( grammar=grammar, ) completion_or_chunks = cast( - llama_types.CreateCompletionResponse, completion_or_chunks + llama_types.CreateCompletionResponse, completion_or_chunks, ) completions.append(completion_or_chunks) completions_tool_name.append(tool_name) @@ -3660,14 +3660,14 @@ def chatml_function_calling( model=model, logits_processor=logits_processor, grammar=llama_grammar.LlamaGrammar.from_string( - follow_up_gbnf_tool_grammar, verbose=llama.verbose + follow_up_gbnf_tool_grammar, verbose=llama.verbose, ), ) response = cast(llama_types.CreateCompletionResponse, response) tool_name = response["choices"][0]["text"][len("functions.") :] tool = next( - (tool for tool in tools if tool["function"]["name"] == tool_name), None + (tool for tool in tools if tool["function"]["name"] == tool_name), None, ) # Merge completions @@ -3682,7 +3682,7 @@ def chatml_function_calling( "function_call": { "name": tool_name, "arguments": completions[0]["choices"][0]["text"], - } + }, } if len(completions) == 1 else {} @@ -3714,12 +3714,12 @@ def chatml_function_calling( }, } for i, (tool_name, completion) in enumerate( - zip(completions_tool_name, completions) + zip(completions_tool_name, completions), ) ], **function_call_dict, }, - } + }, ], "usage": { "completion_tokens": sum( From 4ebb74e7f1ef0b030423479effca3989308fdeb9 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 23:08:15 +0200 Subject: [PATCH 086/177] Lint --- llama_cpp/server/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index b78d6c4fe..4ce33814f 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -110,7 +110,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: ) else: chat_handler = llama_cpp.llama_chat_format.Llava16ChatHandler( - clip_model_path=settings.clip_model_path, verbose=settings.verbose + clip_model_path=settings.clip_model_path, verbose=settings.verbose, ) elif settings.chat_format == "moondream": assert settings.clip_model_path is not None, "clip model not found" @@ -160,7 +160,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: ), "hf_pretrained_model_name_or_path must be set for hf-autotokenizer" chat_handler = ( llama_cpp.llama_chat_format.hf_autotokenizer_to_chat_completion_handler( - settings.hf_pretrained_model_name_or_path + settings.hf_pretrained_model_name_or_path, ) ) elif settings.chat_format == "hf-tokenizer-config": From 0a2753ae866eb258e067cce8f1fd335c83d3f2bc Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 23:08:59 +0200 Subject: [PATCH 087/177] Lint --- llama_cpp/server/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/cli.py b/llama_cpp/server/cli.py index b2b2ac6cd..c8bb46fa6 100644 --- a/llama_cpp/server/cli.py +++ b/llama_cpp/server/cli.py @@ -92,5 +92,5 @@ def parse_model_from_args(model: T, args: argparse.Namespace) -> T: k: v for k, v in vars(args).items() if v is not None and k in model.model_fields - } + }, ) From 7921e76f1117ccf0218994eea40335c0ccb2d18a Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 23:13:53 +0200 Subject: [PATCH 088/177] Lint --- llama_cpp/server/app.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 
ec7da0712..87dcf5567 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -109,7 +109,7 @@ def create_app( import yaml config_file_settings = ConfigFileSettings.model_validate_json( - json.dumps(yaml.safe_load(f)) + json.dumps(yaml.safe_load(f)), ) else: config_file_settings = ConfigFileSettings.model_validate_json(f.read()) @@ -236,10 +236,10 @@ async def authenticate( "application/json": { "schema": { "anyOf": [ - {"$ref": "#/components/schemas/CreateCompletionResponse"} + {"$ref": "#/components/schemas/CreateCompletionResponse"}, ], "title": "Completion response, when stream=False", - } + }, }, "text/event-stream": { "schema": { @@ -247,10 +247,10 @@ async def authenticate( "title": "Server Side Streaming response, when stream=True. " + "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format", "example": """data: {... see CreateCompletionResponse ...} \\n\\n data: ... \\n\\n ... data: [DONE]""", - } + }, }, }, - } + }, }, tags=[openai_v1_tag], ) @@ -266,7 +266,7 @@ async def create_completion( ) -> llama_cpp.Completion: exit_stack = contextlib.ExitStack() llama_proxy = await run_in_threadpool( - lambda: exit_stack.enter_context(contextlib.contextmanager(get_llama_proxy)()) + lambda: exit_stack.enter_context(contextlib.contextmanager(get_llama_proxy)()), ) if llama_proxy is None: raise HTTPException( @@ -280,7 +280,7 @@ async def create_completion( llama = llama_proxy( body.model if request.url.path != "/v1/engines/copilot-codex/completions" - else "copilot-codex" + else "copilot-codex", ) exclude = { @@ -304,7 +304,7 @@ async def create_completion( if body.min_tokens > 0: _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( - [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] + [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())], ) if "logits_processor" not in kwargs: kwargs["logits_processor"] = _min_tokens_logits_processor @@ -373,11 +373,11 @@ async def create_embedding( "schema": { "anyOf": [ { - "$ref": "#/components/schemas/CreateChatCompletionResponse" - } + "$ref": "#/components/schemas/CreateChatCompletionResponse", + }, ], "title": "Completion response, when stream=False", - } + }, }, "text/event-stream": { "schema": { @@ -385,10 +385,10 @@ async def create_embedding( "title": "Server Side Streaming response, when stream=True" + "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format", "example": """data: {... see CreateChatCompletionResponse ...} \\n\\n data: ... \\n\\n ... 
data: [DONE]""", - } + }, }, }, - } + }, }, tags=[openai_v1_tag], ) @@ -440,7 +440,7 @@ async def create_chat_completion( "required": ["name", "age"], }, }, - } + }, ], "tool_choice": { "type": "function", @@ -462,7 +462,7 @@ async def create_chat_completion( "top_logprobs": 10, }, }, - } + }, ), ) -> llama_cpp.ChatCompletion: # This is a workaround for an issue in FastAPI dependencies @@ -471,7 +471,7 @@ async def create_chat_completion( # https://github.com/tiangolo/fastapi/issues/11143 exit_stack = contextlib.ExitStack() llama_proxy = await run_in_threadpool( - lambda: exit_stack.enter_context(contextlib.contextmanager(get_llama_proxy)()) + lambda: exit_stack.enter_context(contextlib.contextmanager(get_llama_proxy)()), ) if llama_proxy is None: raise HTTPException( @@ -498,7 +498,7 @@ async def create_chat_completion( if body.min_tokens > 0: _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( - [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] + [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())], ) if "logits_processor" not in kwargs: kwargs["logits_processor"] = _min_tokens_logits_processor @@ -506,7 +506,7 @@ async def create_chat_completion( kwargs["logits_processor"].extend(_min_tokens_logits_processor) iterator_or_completion: Union[ - llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk] + llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk], ] = await run_in_threadpool(llama.create_chat_completion, **kwargs) if isinstance(iterator_or_completion, Iterator): From 3564f2dc574fb0fa458a2a717afd091d248d984d Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 23:20:30 +0200 Subject: [PATCH 089/177] Lint --- llama_cpp/server/types.py | 44 +++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index e95ab11ac..4b4298d06 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -8,11 +8,11 @@ import llama_cpp model_field = Field( - description="The model to use for generating completions.", default=None + description="The model to use for generating completions.", default=None, ) max_tokens_field = Field( - default=16, ge=1, description="The maximum number of tokens to generate." + default=16, ge=1, description="The maximum number of tokens to generate.", ) min_tokens_field = Field( @@ -96,7 +96,7 @@ ) mirostat_eta_field = Field( - default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate" + default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate", ) grammar = Field( @@ -107,14 +107,14 @@ class CreateCompletionRequest(BaseModel): prompt: Union[str, List[str]] = Field( - default="", description="The prompt to generate completions for." + default="", description="The prompt to generate completions for.", ) suffix: Optional[str] = Field( default=None, description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.", ) max_tokens: Optional[int] = Field( - default=16, ge=0, description="The maximum number of tokens to generate." 
+ default=16, ge=0, description="The maximum number of tokens to generate.", ) min_tokens: int = min_tokens_field temperature: float = temperature_field @@ -157,9 +157,9 @@ class CreateCompletionRequest(BaseModel): { "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", "stop": ["\n", "###"], - } - ] - } + }, + ], + }, } @@ -173,24 +173,24 @@ class CreateEmbeddingRequest(BaseModel): "examples": [ { "input": "The food was delicious and the waiter...", - } - ] - } + }, + ], + }, } class ChatCompletionRequestMessage(BaseModel): role: Literal["system", "user", "assistant", "function"] = Field( - default="user", description="The role of the message." + default="user", description="The role of the message.", ) content: Optional[str] = Field( - default="", description="The content of the message." + default="", description="The content of the message.", ) class CreateChatCompletionRequest(BaseModel): messages: List[llama_cpp.ChatCompletionRequestMessage] = Field( - default=[], description="A list of messages to generate completions for." + default=[], description="A list of messages to generate completions for.", ) functions: Optional[List[llama_cpp.ChatCompletionFunction]] = Field( default=None, @@ -255,15 +255,15 @@ class CreateChatCompletionRequest(BaseModel): { "messages": [ ChatCompletionRequestMessage( - role="system", content="You are a helpful assistant." + role="system", content="You are a helpful assistant.", ).model_dump(), ChatCompletionRequestMessage( - role="user", content="What is the capital of France?" + role="user", content="What is the capital of France?", ).model_dump(), - ] - } - ] - } + ], + }, + ], + }, } @@ -284,7 +284,7 @@ class TokenizeInputRequest(BaseModel): input: str = Field(description="The input to tokenize.") model_config = { - "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]} + "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]}, } @@ -311,5 +311,5 @@ class DetokenizeInputResponse(BaseModel): text: str = Field(description="The detokenized text.") model_config = { - "json_schema_extra": {"example": {"text": "How many tokens in this query?"}} + "json_schema_extra": {"example": {"text": "How many tokens in this query?"}}, } From a0a58d818f7e4bcbfeb53eb5a0efc816e7f092ee Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 23:24:37 +0200 Subject: [PATCH 090/177] Lint --- llama_cpp/llama_chat_format.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 09def4c13..6968ee08d 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1631,7 +1631,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): try: with suppress_stdout_stderr(disable=llama.verbose): grammar_text = llama_grammar.json_schema_to_gbnf( - json.dumps(function_body), + json.dumps(function_body), ) grammar = llama_grammar.LlamaGrammar.from_string( llama_grammar.json_schema_to_gbnf(json.dumps(function_body)), @@ -1792,7 +1792,7 @@ def generate_type_definition( nested_schema = "{\n" for nested_param_name, nested_param in properties.items(): nested_param_type = generate_type_definition( - nested_param, indent_level + 1, shared_defs + nested_param, indent_level + 1, shared_defs, ) nested_schema += ( f"{indent} {nested_param_name}: {nested_param_type},\n" @@ -2129,7 +2129,7 @@ def generate_streaming(tools, functions, function_call, prompt): "content": None, 
**func_call_dict, }, - } + }, ], ) # Yield tool_call/function_call stop message @@ -2209,8 +2209,8 @@ def generate_streaming(tools, functions, function_call, prompt): "name": function_name, "arguments": "", }, - } - ] + }, + ], } else: func_call_dict = { @@ -2797,7 +2797,7 @@ def embed_image_bytes(image_bytes: bytes): ) if llama.n_tokens + len(tokens) > llama.n_ctx(): raise ValueError( - f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}" + f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}", ) llama.eval(tokens) else: From 7bded1607b9dd6b89b885250aa32313154b1ec51 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 23:25:44 +0200 Subject: [PATCH 091/177] Lint --- llama_cpp/llama_chat_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 6968ee08d..a49bf6593 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2214,7 +2214,7 @@ def generate_streaming(tools, functions, function_call, prompt): } else: func_call_dict = { - "function_call": {"name": function_name, "arguments": ""} + "function_call": {"name": function_name, "arguments": ""}, } # Stream function name yield llama_types.CreateChatCompletionStreamResponse( From cc68145a8eb852407de9a4621a07dd357a16bbad Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 23:43:50 +0200 Subject: [PATCH 092/177] bugfix --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index f8e749dc1..e0f1632c9 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -2238,7 +2238,7 @@ def __call__( class StoppingCriteriaList(List[StoppingCriteria]): def __call__( - self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single, + self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single], ) -> bool: return any([stopping_criteria(input_ids, logits) for stopping_criteria in self]) From dd962f8834e79494b1dfbbe6bd4182497a0fac78 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Fri, 2 Aug 2024 23:53:45 +0200 Subject: [PATCH 093/177] Lint --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index e0f1632c9..d23baeed3 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1380,7 +1380,7 @@ def logit_bias_processor( "index": 0, "logprobs": None, "finish_reason": None, - } + }, ], } From 019e5801a18ac3ed50292f6102c92ab46ad0e0c7 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 00:20:06 +0200 Subject: [PATCH 094/177] add C90 rules --- .github/workflows/linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 55738aaf9..86437f155 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -24,4 +24,4 @@ jobs: cache: 'pip' - uses: chartboost/ruff-action@v1 with: - args: 'check --select A,ANN,ASYNC,B,C4,COM,DTZ,E,EM,F,I,PERF,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' + args: 'check --select A,ANN,ASYNC,B,C4,C90,COM,DTZ,E,EM,F,I,PERF,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' From dc70fb006d5413a214105f3c9f35c9cee094b6f0 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 00:23:12 +0200 Subject: [PATCH 095/177] add ERA and FA rules --- 
.github/workflows/linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 86437f155..d5b4c0b17 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -24,4 +24,4 @@ jobs: cache: 'pip' - uses: chartboost/ruff-action@v1 with: - args: 'check --select A,ANN,ASYNC,B,C4,C90,COM,DTZ,E,EM,F,I,PERF,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' + args: 'check --select A,ANN,ASYNC,B,C4,C90,COM,DTZ,E,EM,ERA,F,FA,I,PERF,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' From 63b25f100182e5d3928f274b01e7123277f53996 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 00:25:01 +0200 Subject: [PATCH 096/177] add FAST, FIX and FLY rules --- .github/workflows/linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index d5b4c0b17..49da23f29 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -24,4 +24,4 @@ jobs: cache: 'pip' - uses: chartboost/ruff-action@v1 with: - args: 'check --select A,ANN,ASYNC,B,C4,C90,COM,DTZ,E,EM,ERA,F,FA,I,PERF,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' + args: 'check --select A,ANN,ASYNC,B,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FIX,FLY,I,PERF,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' From f3f905bd3b2af024174e569f69c467a7da81ad94 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 00:29:25 +0200 Subject: [PATCH 097/177] add AIR, PT and PGH rules --- .github/workflows/linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 49da23f29..ebf5c7906 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -24,4 +24,4 @@ jobs: cache: 'pip' - uses: chartboost/ruff-action@v1 with: - args: 'check --select A,ANN,ASYNC,B,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FIX,FLY,I,PERF,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' + args: 'check --select A,AIR,ANN,ASYNC,B,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FIX,FLY,G,I,PERF,PGH,PT,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' From 17d1a1f4a0c3a5e0f5d504c652f7d6292b0bc714 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 00:30:57 +0200 Subject: [PATCH 098/177] add ARG rules --- .github/workflows/linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index ebf5c7906..76d6233e7 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -24,4 +24,4 @@ jobs: cache: 'pip' - uses: chartboost/ruff-action@v1 with: - args: 'check --select A,AIR,ANN,ASYNC,B,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FIX,FLY,G,I,PERF,PGH,PT,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' + args: 'check --select A,AIR,ANN,ARG,ASYNC,B,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FIX,FLY,G,I,PERF,PGH,PT,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' From 7850a9636e4281128653da61262c0b8843c29bb2 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 00:31:34 +0200 Subject: [PATCH 099/177] add BLE rules 
--- .github/workflows/linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 76d6233e7..50d915da9 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -24,4 +24,4 @@ jobs: cache: 'pip' - uses: chartboost/ruff-action@v1 with: - args: 'check --select A,AIR,ANN,ARG,ASYNC,B,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FIX,FLY,G,I,PERF,PGH,PT,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' + args: 'check --select A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FIX,FLY,G,I,PERF,PGH,PT,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' From 7d639bd93fb94221861a50d4faaf75a69abd5348 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 00:35:45 +0200 Subject: [PATCH 100/177] add ICN, ISC rules --- .github/workflows/linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 50d915da9..7f309c250 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -24,4 +24,4 @@ jobs: cache: 'pip' - uses: chartboost/ruff-action@v1 with: - args: 'check --select A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FIX,FLY,G,I,PERF,PGH,PT,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' + args: 'check --select A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FBT,FIX,FLY,FURB,G,I,ICN,ISC,PERF,PGH,PT,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' From 0d59366330b7ec51a0185e03b3047df52168c2e9 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 00:39:32 +0200 Subject: [PATCH 101/177] Update linter.yml --- .github/workflows/linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 7f309c250..762fb6a85 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -24,4 +24,4 @@ jobs: cache: 'pip' - uses: chartboost/ruff-action@v1 with: - args: 'check --select A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FBT,FIX,FLY,FURB,G,I,ICN,ISC,PERF,PGH,PT,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' + args: 'check --select A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FBT,FIX,FLY,FURB,G,I,ICN,INP,INT,ISC,LOG,N,PERF,PGH,PT,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' From 9c2df8b9b89a158e34ab860f258367e1f94e4bb0 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 00:41:20 +0200 Subject: [PATCH 102/177] Update linter.yml --- .github/workflows/linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 762fb6a85..55801422e 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -24,4 +24,4 @@ jobs: cache: 'pip' - uses: chartboost/ruff-action@v1 with: - args: 'check --select A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FBT,FIX,FLY,FURB,G,I,ICN,INP,INT,ISC,LOG,N,PERF,PGH,PT,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' + args: 'check --preview --select 
A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FBT,FIX,FLY,FURB,G,I,ICN,INP,INT,ISC,LOG,N,PERF,PGH,PT,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' From 08692dd7ebc429bcdc577869c7720e4eb742e05d Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 00:48:50 +0200 Subject: [PATCH 103/177] Create fixer.yml --- .github/workflows/fixer.yml | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 .github/workflows/fixer.yml diff --git a/.github/workflows/fixer.yml b/.github/workflows/fixer.yml new file mode 100644 index 000000000..7ea87c527 --- /dev/null +++ b/.github/workflows/fixer.yml @@ -0,0 +1,34 @@ +name: Fixer + +on: [push, pull_request] + +concurrency: + group: fixer-${{ github.event_name == 'pull_request' && format('{0}-{1}', github.workflow, github.event.pull_request.number) || github.workflow_ref }} + cancel-in-progress: true + +jobs: + ruff-lint: + name: Ruff + runs-on: ubuntu-latest + permissions: + # Give the default GITHUB_TOKEN write permission to commit and push the + # added or changed files to the repository. + contents: write + + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' + + - uses: chartboost/ruff-action@v1 + with: + args: 'check --fix-only' + + - uses: stefanzweifel/git-auto-commit-action@v5 + with: + commit_message: 'style fixes by ruff' From 069fb9af55e1db3ac50710bbefa29c0d94442eaa Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 00:51:57 +0200 Subject: [PATCH 104/177] Update fixer.yml --- .github/workflows/fixer.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fixer.yml b/.github/workflows/fixer.yml index 7ea87c527..3c451f05a 100644 --- a/.github/workflows/fixer.yml +++ b/.github/workflows/fixer.yml @@ -27,7 +27,7 @@ jobs: - uses: chartboost/ruff-action@v1 with: - args: 'check --fix-only' + args: 'check --preview --fix-only' - uses: stefanzweifel/git-auto-commit-action@v5 with: From 53f7b08c3e29a5e2b5205b6ddd041114aac74aa0 Mon Sep 17 00:00:00 2001 From: Smartappli Date: Fri, 2 Aug 2024 22:52:15 +0000 Subject: [PATCH 105/177] style fixes by ruff --- examples/notebooks/Batching.ipynb | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/notebooks/Batching.ipynb b/examples/notebooks/Batching.ipynb index 73b28c744..26ef05f17 100644 --- a/examples/notebooks/Batching.ipynb +++ b/examples/notebooks/Batching.ipynb @@ -567,7 +567,6 @@ } ], "source": [ - "import ctypes\n", "\n", "streams = [\"\"] * n_parallel\n", "i_batch = [batch.n_tokens - 1] * n_parallel\n", From 02b87f3ffee292cdcd57b4e0e79b1647f3ed8911 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 01:13:22 +0200 Subject: [PATCH 106/177] Update linter.yml --- .github/workflows/linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 55801422e..762fb6a85 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -24,4 +24,4 @@ jobs: cache: 'pip' - uses: chartboost/ruff-action@v1 with: - args: 'check --preview --select A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FBT,FIX,FLY,FURB,G,I,ICN,INP,INT,ISC,LOG,N,PERF,PGH,PT,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' + args: 'check --select 
A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FBT,FIX,FLY,FURB,G,I,ICN,INP,INT,ISC,LOG,N,PERF,PGH,PT,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' From 6679172cca51cd35a6e800f38a77cf149fee232a Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 01:17:56 +0200 Subject: [PATCH 107/177] Delete .github/workflows/fixer.yml --- .github/workflows/fixer.yml | 34 ---------------------------------- 1 file changed, 34 deletions(-) delete mode 100644 .github/workflows/fixer.yml diff --git a/.github/workflows/fixer.yml b/.github/workflows/fixer.yml deleted file mode 100644 index 3c451f05a..000000000 --- a/.github/workflows/fixer.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: Fixer - -on: [push, pull_request] - -concurrency: - group: fixer-${{ github.event_name == 'pull_request' && format('{0}-{1}', github.workflow, github.event.pull_request.number) || github.workflow_ref }} - cancel-in-progress: true - -jobs: - ruff-lint: - name: Ruff - runs-on: ubuntu-latest - permissions: - # Give the default GITHUB_TOKEN write permission to commit and push the - # added or changed files to the repository. - contents: write - - steps: - - uses: actions/checkout@v4 - with: - ref: ${{ github.head_ref }} - - - uses: actions/setup-python@v5 - with: - python-version: '3.12' - cache: 'pip' - - - uses: chartboost/ruff-action@v1 - with: - args: 'check --preview --fix-only' - - - uses: stefanzweifel/git-auto-commit-action@v5 - with: - commit_message: 'style fixes by ruff' From 35533fd73b76f9d6a1cde0d1f2364ab088ccf47f Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 14:09:23 +0200 Subject: [PATCH 108/177] add new rules --- .github/workflows/linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 762fb6a85..aaeeeb161 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -24,4 +24,4 @@ jobs: cache: 'pip' - uses: chartboost/ruff-action@v1 with: - args: 'check --select A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FBT,FIX,FLY,FURB,G,I,ICN,INP,INT,ISC,LOG,N,PERF,PGH,PT,NPY,PL,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' + args: 'check --select A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FBT,FIX,FLY,FURB,G,I,ICN,INP,INT,ISC,LOG,N,NPY,PERF,PGH,PIE,PL,PT,PTH,PYI,Q,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' From ee5951d097f4e79a295ebfb1a33b860026fb08ab Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 14:14:15 +0200 Subject: [PATCH 109/177] add more rules --- .github/workflows/linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index aaeeeb161..4fc91d690 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -24,4 +24,4 @@ jobs: cache: 'pip' - uses: chartboost/ruff-action@v1 with: - args: 'check --select A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FBT,FIX,FLY,FURB,G,I,ICN,INP,INT,ISC,LOG,N,NPY,PERF,PGH,PIE,PL,PT,PTH,PYI,Q,RUF,S,SIM,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' + args: 'check --select A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FBT,FIX,FLY,FURB,G,I,ICN,INP,INT,ISC,LOG,N,NPY,PERF,PGH,PIE,PL,PT,PTH,PYI,Q,R,RET,RSE,RUF,S,SIM,SLF,SLOT,T10,T20,UP,W,YTT --output-format github --diff --exclude 
docker,docs,scripts,tests,vendor' From f8b8f41c1eb32d325443384a424cab25a737128d Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 14:17:42 +0200 Subject: [PATCH 110/177] Lint --- llama_cpp/llama.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index d23baeed3..eb8a8a6ab 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -2070,12 +2070,12 @@ def pooling_type(self) -> str: def close(self) -> None: """Explicitly free the model from memory.""" - if hasattr(self,'_stack'): + if hasattr(self,"_stack"): if self._stack is not None: self._stack.close() def __del__(self) -> None: - if hasattr(self,'_lora_adapter'): + if hasattr(self,"_lora_adapter"): if self._lora_adapter is not None: llama_cpp.llama_lora_adapter_free(self._lora_adapter) self.close() From cf095ee2e10a473c6e85091953d8cfa58e8fb86e Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 14:21:02 +0200 Subject: [PATCH 111/177] Lint --- llama_cpp/llama_speculative.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_speculative.py b/llama_cpp/llama_speculative.py index cfb69bfac..6950c5a64 100644 --- a/llama_cpp/llama_speculative.py +++ b/llama_cpp/llama_speculative.py @@ -10,7 +10,7 @@ class LlamaDraftModel(abc.ABC): def __call__( self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any, ) -> npt.NDArray[np.intc]: - raise NotImplementedError() + raise NotImplementedError class LlamaPromptLookupDecoding(LlamaDraftModel): From e86d973b4736652896c31a87a85e446b6b1af69a Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 14:39:27 +0200 Subject: [PATCH 112/177] Lint --- llama_cpp/llama_cpp.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 21799bcc9..bed241aea 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -141,7 +141,6 @@ def decorator(f: F) -> F: def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]: """Type-annotated version of ctypes.byref""" - ... byref = ctypes.byref # type: ignore @@ -1123,7 +1122,6 @@ class llama_chat_message(ctypes.Structure): ) def llama_model_default_params() -> llama_model_params: """Get default parameters for llama_model""" - ... # LLAMA_API struct llama_context_params llama_context_default_params(void); @@ -1134,7 +1132,6 @@ def llama_model_default_params() -> llama_model_params: ) def llama_context_default_params() -> llama_context_params: """Get default parameters for llama_context""" - ... # LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void); @@ -1145,7 +1142,6 @@ def llama_context_default_params() -> llama_context_params: ) def llama_model_quantize_default_params() -> llama_model_quantize_params: """Get default parameters for llama_model_quantize""" - ... # // Initialize the llama + ggml backend @@ -1162,7 +1158,6 @@ def llama_backend_init(): """Initialize the llama + ggml backend If numa is true, use NUMA optimizations Call once at the start of the program""" - ... # // numa strategies @@ -1202,7 +1197,6 @@ def llama_numa_init(numa: int, /): ) def llama_backend_free(): """Call once at the end of the program - currently only used for MPI""" - ... # LLAMA_API struct llama_model * llama_load_model_from_file( @@ -1252,7 +1246,6 @@ def llama_new_context_with_model( ) def llama_free(ctx: llama_context_p, /): """Frees all allocated memory""" - ... 
# LLAMA_API int64_t llama_time_us(void); @@ -1366,7 +1359,6 @@ def llama_n_layer(model: llama_model_p, /) -> int: @ctypes_function("llama_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float) def llama_rope_freq_scale_train(model: llama_model_p, /) -> float: """Get the model's RoPE frequency scaling factor""" - ... # // Functions to access the model's GGUF metadata scalar values @@ -1395,7 +1387,6 @@ def llama_model_meta_val_str( /, ) -> int: """Get metadata value as a string by key name""" - ... # // Get the number of metadata key/value pairs @@ -1403,7 +1394,6 @@ def llama_model_meta_val_str( @ctypes_function("llama_model_meta_count", [llama_model_p_ctypes], ctypes.c_int32) def llama_model_meta_count(model: llama_model_p, /) -> int: """Get the number of metadata key/value pairs""" - ... # // Get metadata key name by index @@ -1426,7 +1416,6 @@ def llama_model_meta_key_by_index( /, ) -> int: """Get metadata key name by index""" - ... # // Get metadata value as a string by index @@ -1449,7 +1438,6 @@ def llama_model_meta_val_str_by_index( /, ) -> int: """Get metadata value as a string by index""" - ... # // Get a string describing the model type @@ -1466,7 +1454,6 @@ def llama_model_desc( /, ) -> int: """Get a string describing the model type""" - ... # // Returns the total size of all the tensors in the model in bytes @@ -1474,7 +1461,6 @@ def llama_model_desc( @ctypes_function("llama_model_size", [llama_model_p_ctypes], ctypes.c_uint64) def llama_model_size(model: llama_model_p, /) -> int: """Returns the total size of all the tensors in the model in bytes""" - ... # // Returns the total number of parameters in the model @@ -1482,7 +1468,6 @@ def llama_model_size(model: llama_model_p, /) -> int: @ctypes_function("llama_model_n_params", [llama_model_p_ctypes], ctypes.c_uint64) def llama_model_n_params(model: llama_model_p, /) -> int: """Returns the total number of parameters in the model""" - ... # // Get a llama model tensor @@ -1494,7 +1479,6 @@ def llama_get_model_tensor( model: llama_model_p, name: Union[ctypes.c_char_p, bytes], /, ) -> ctypes.c_void_p: """Get a llama model tensor""" - ... # // Returns true if the model contains an encoder that requires llama_encode() call @@ -1502,7 +1486,6 @@ def llama_get_model_tensor( @ctypes_function("llama_model_has_encoder", [llama_model_p_ctypes], ctypes.c_bool) def llama_model_has_encoder(model: llama_model_p, /) -> bool: """Returns true if the model contains an encoder that requires llama_encode() call""" - ... # // For encoder-decoder models, this function returns id of the token that must be provided @@ -1515,7 +1498,6 @@ def llama_model_decoder_start_token(model: llama_model_p, /) -> int: """For encoder-decoder models, this function returns id of the token that must be provided to the decoder to start generating output sequence. For other models, it returns -1. """ - ... # // Returns 0 on success @@ -1539,7 +1521,6 @@ def llama_model_quantize( /, ) -> int: """Returns 0 on success""" - ... # // Load a LoRA adapter from file @@ -1558,7 +1539,6 @@ def llama_lora_adapter_init( """Load a LoRA adapter from file The loaded adapter will be associated to the given model, and will be free when the model is deleted """ - ... # // Add a loaded LoRA adapter to given context @@ -1577,7 +1557,6 @@ def llama_lora_adapter_set( ) -> int: """Add a loaded LoRA adapter to given context This will not modify model's weight""" - ... 
# // Remove a specific LoRA adapter from given context @@ -1595,7 +1574,6 @@ def llama_lora_adapter_remove( ) -> int: """Remove a LoRA adapter from given context Return -1 if the adapter is not present in the context""" - ... # // Remove all LoRA adapters from given context @@ -1608,7 +1586,6 @@ def llama_lora_adapter_remove( ) def llama_lora_adapter_clear(ctx: llama_context_p, /): """Remove all LoRA adapters from given context""" - ... # // Manually free a LoRA adapter @@ -1622,7 +1599,6 @@ def llama_lora_adapter_clear(ctx: llama_context_p, /): def llama_lora_adapter_free(adapter: llama_lora_adapter_p, /): """Manually free a LoRA adapter Note: loaded adapters will be free when the associated model is deleted""" - ... # // Apply a loaded control vector to a llama_context, or if data is NULL, clear @@ -1665,7 +1641,6 @@ def llama_control_vector_apply( to an n_embd x n_layers buffer starting from layer 1. il_start and il_end are the layer range the vector should apply to (both inclusive) See llama_control_vector_load in common to load a control vector.""" - ... # // From 20fad2772103c9db3ed1ad6cf7a0c864f1968422 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 14:56:45 +0200 Subject: [PATCH 113/177] Lint --- llama_cpp/llama_cpp.py | 36 ------------------------------------ 1 file changed, 36 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index bed241aea..c8a4e3110 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1736,7 +1736,6 @@ def llama_kv_cache_view_init( ctx: llama_context_p, n_seq_max: Union[ctypes.c_int32, int], /, ) -> llama_kv_cache_view: """Create an empty KV cache view. (use only for debugging purposes)""" - ... # // Free a KV cache view. (use only for debugging purposes) @@ -1744,7 +1743,6 @@ def llama_kv_cache_view_init( @ctypes_function("llama_kv_cache_view_free", [llama_kv_cache_view_p], None) def llama_kv_cache_view_free(view: ctypes.pointer[llama_kv_cache_view], /): # type: ignore """Free a KV cache view. (use only for debugging purposes)""" - ... # // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes) @@ -1754,7 +1752,6 @@ def llama_kv_cache_view_free(view: ctypes.pointer[llama_kv_cache_view], /): # t ) def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[llama_kv_cache_view], /): # type: ignore """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)""" - ... # // Returns the number of tokens in the KV cache (slow, use only for debug) @@ -1767,7 +1764,6 @@ def llama_get_kv_cache_token_count(ctx: llama_context_p, /) -> int: """Returns the number of tokens in the KV cache (slow, use only for debug) If a KV cell has multiple sequences assigned to it, it will be counted multiple times """ - ... # // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) @@ -1777,7 +1773,6 @@ def llama_get_kv_cache_token_count(ctx: llama_context_p, /) -> int: ) def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int: """Returns the number of used KV cells (i.e. have at least one sequence assigned to them)""" - ... # // Clear the KV cache - both cell info is erased and KV data is zeroed @@ -1786,7 +1781,6 @@ def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int: @ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None) def llama_kv_cache_clear(ctx: llama_context_p, /): """Clear the KV cache""" - ... 
# // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) @@ -1823,7 +1817,6 @@ def llama_kv_cache_seq_rm( seq_id < 0 : match any sequence p0 < 0 : [0, p1] p1 < 0 : [p0, inf)""" - ... # // Copy all tokens that belong to the specified sequence to another sequence @@ -1859,7 +1852,6 @@ def llama_kv_cache_seq_cp( Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence p0 < 0 : [0, p1] p1 < 0 : [p0, inf)""" - ... # // Removes all tokens that do not belong to the specified sequence @@ -1871,7 +1863,6 @@ def llama_kv_cache_seq_cp( ) def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): """Removes all tokens that do not belong to the specified sequence""" - ... # // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) @@ -1947,7 +1938,6 @@ def llama_kv_cache_seq_div( If the KV cache is RoPEd, the KV data is updated accordingly p0 < 0 : [0, p1] p1 < 0 : [p0, inf)""" - ... # // Defragment the KV cache @@ -1961,7 +1951,6 @@ def llama_kv_cache_defrag(ctx: llama_context_p, /): This will be applied: - lazily on next llama_decode() - explicitly with llama_kv_cache_update()""" - ... # // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) @@ -1969,7 +1958,6 @@ def llama_kv_cache_defrag(ctx: llama_context_p, /): @ctypes_function("llama_kv_cache_update", [llama_context_p_ctypes], None) def llama_kv_cache_update(ctx: llama_context_p, /): """Apply the KV cache updates (such as K-shifts, defragmentation, etc.)""" - ... # // @@ -1984,7 +1972,6 @@ def llama_kv_cache_update(ctx: llama_context_p, /): @ctypes_function("llama_state_get_size", [llama_context_p_ctypes], ctypes.c_size_t) def llama_state_get_size(ctx: llama_context_p, /) -> int: """Returns the *actual* size in bytes of the state (rng, logits, embedding and kv_cache) - will often be smaller after compacting tokens""" - ... # LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx), @@ -1993,7 +1980,6 @@ def llama_state_get_size(ctx: llama_context_p, /) -> int: def llama_get_state_size(ctx: llama_context_p, /) -> int: """Returns the maximum size in bytes of the state (rng, logits, embedding and kv_cache) - will often be smaller after compacting tokens""" - ... # // Copies the state to the specified destination address. @@ -2021,7 +2007,6 @@ def llama_state_get_data( """Copies the state to the specified destination address. Destination needs to have allocated enough memory. Returns the number of bytes copied""" - ... # LLAMA_API DEPRECATED(size_t llama_copy_state_data( @@ -2042,7 +2027,6 @@ def llama_copy_state_data( """Copies the state to the specified destination address. Destination needs to have allocated enough memory. Returns the number of bytes copied""" - ... # // Set the state reading from the specified address @@ -2064,7 +2048,6 @@ def llama_state_set_data( ) -> int: """Set the state reading from the specified address Returns the number of bytes read""" - ... # LLAMA_API DEPRECATED(size_t llama_set_state_data( @@ -2080,7 +2063,6 @@ def llama_set_state_data( ctx: llama_context_p, src: CtypesArray[ctypes.c_uint8], /, ) -> int: """Set the state reading from the specified address""" - ... # Save/load session file @@ -2203,7 +2185,6 @@ def llama_save_session_file( ) def llama_state_seq_get_size(ctx: llama_context_p, seq_id: llama_seq_id, /) -> int: """Get the exact size needed to copy the KV cache of a single sequence""" - ... 
# // Copy the KV cache of a single sequence into the specified buffer @@ -2260,7 +2241,6 @@ def llama_state_seq_set_data( /, ) -> int: """Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence""" - ... # LLAMA_API size_t llama_state_seq_save_file( @@ -2357,7 +2337,6 @@ def llama_batch_get_one( NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it """ - ... # // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens @@ -2387,7 +2366,6 @@ def llama_batch_init( Otherwise, llama_batch.token will be allocated to store n_tokens llama_token The rest of the llama_batch members are allocated with size n_tokens All members are left uninitialized""" - ... # // Frees a batch of tokens allocated with llama_batch_init() @@ -2395,7 +2373,6 @@ def llama_batch_init( @ctypes_function("llama_batch_free", [llama_batch], None) def llama_batch_free(batch: llama_batch, /): """Frees a batch of tokens allocated with llama_batch_init()""" - ... # // Processes a batch of tokens with the ecoder part of the encoder-decoder model. @@ -2411,7 +2388,6 @@ def llama_encode(ctx: llama_context_p, batch: llama_batch, /) -> int: Stores the encoder output internally for later use by the decoder cross-attention layers. 0 - success < 0 - error""" - ... # // Positive return values does not mean a fatal error, but rather a warning. @@ -2427,7 +2403,6 @@ def llama_decode(ctx: llama_context_p, batch: llama_batch, /) -> int: 0 - success 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) < 0 - error""" - ... # // Set the number of threads used for decoding @@ -2453,7 +2428,6 @@ def llama_set_n_threads( n_threads is the number of threads used for generation (single token) n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens) """ - ... # // Get the number of threads used for generation of a single token. @@ -2461,7 +2435,6 @@ def llama_set_n_threads( @ctypes_function("llama_n_threads", [llama_context_p_ctypes], ctypes.c_uint32) def llama_n_threads(ctx: llama_context_p, /) -> int: """Get the number of threads used for generation of a single token""" - ... # // Get the number of threads used for prompt and batch processing (multiple token). @@ -2469,7 +2442,6 @@ def llama_n_threads(ctx: llama_context_p, /) -> int: @ctypes_function("llama_n_threads_batch", [llama_context_p_ctypes], ctypes.c_uint32) def llama_n_threads_batch(ctx: llama_context_p, /) -> int: """Get the number of threads used for prompt and batch processing (multiple token)""" - ... # // Set whether the model is in embeddings mode or not @@ -2479,7 +2451,6 @@ def llama_n_threads_batch(ctx: llama_context_p, /) -> int: def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /): """Set whether the model is in embeddings model or not If true, embeddings will be returned but logits will not""" - ... # // Set whether to use causal attention or not @@ -2489,7 +2460,6 @@ def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /): def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /): """Set whether to use causal attention or not If set to true, the model will only attend to the past tokens""" - ... # // Set abort callback @@ -2506,7 +2476,6 @@ def llama_set_abort_callback( /, ): """Set abort callback""" - ... 
# // Wait until all computations are finished @@ -2518,7 +2487,6 @@ def llama_synchronize(ctx: llama_context_p, /): """Wait until all computations are finished This is automatically done when using one of the functions below to obtain the computation results and is not necessary to call it explicitly in most cases""" - ... # // Token logits obtained from the last call to llama_decode() @@ -2539,7 +2507,6 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: Returns: Pointer to the logits buffer of shape (n_tokens, n_vocab)""" - ... # // Logits for the ith token. For positive indices, Equivalent to: @@ -2557,7 +2524,6 @@ def llama_get_logits_ith( ) -> CtypesArray[ctypes.c_float]: """Logits for the ith token. Equivalent to: llama_get_logits(ctx) + i*n_vocab""" - ... # // Get all output token embeddings. @@ -2573,7 +2539,6 @@ def llama_get_logits_ith( def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: """Get the embeddings for the input shape: [n_embd] (1-dimensional)""" - ... # // Get the embeddings for the ith token. For positive indices, Equivalent to: @@ -2592,7 +2557,6 @@ def llama_get_embeddings_ith( ) -> CtypesArray[ctypes.c_float]: """Get the embeddings for the ith sequence llama_get_embeddings(ctx) + i*n_embd""" - ... # // Get the embeddings for a sequence id From 5428cce583d3306c33134c010ad498a6ebd9ed53 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 20:24:58 +0200 Subject: [PATCH 114/177] Lint --- llama_cpp/llama_cpp.py | 44 ------------------------------------------ 1 file changed, 44 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index c8a4e3110..8c6a51bfd 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1902,7 +1902,6 @@ def llama_kv_cache_seq_add( - explicitly with llama_kv_cache_update() p0 < 0 : [0, p1] p1 < 0 : [p0, inf)""" - ... # // Integer division of the positions by factor of `d > 1` @@ -2211,7 +2210,6 @@ def llama_state_seq_get_data( /, ) -> int: """Copy the KV cache of a single sequence into the specified buffer""" - ... # // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence @@ -2574,7 +2572,6 @@ def llama_get_embeddings_seq( """Get the embeddings for a sequence id Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE shape: [n_embd] (1-dimensional)""" - ... # // @@ -2619,7 +2616,6 @@ def llama_token_get_attr( ) def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /) -> bool: """Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)""" - ... # // Identify if Token Id is a control token or a render-able token @@ -2631,7 +2627,6 @@ def llama_token_is_control( model: llama_model_p, token: Union[llama_token, int], /, ) -> bool: """Identify if Token Id is a control token or a render-able token""" - ... # // Special tokens @@ -2641,35 +2636,30 @@ def llama_token_is_control( @ctypes_function("llama_token_bos", [llama_model_p_ctypes], llama_token) def llama_token_bos(model: llama_model_p, /) -> int: """beginning-of-sentence""" - ... # LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence @ctypes_function("llama_token_eos", [llama_model_p_ctypes], llama_token) def llama_token_eos(model: llama_model_p, /) -> int: """end-of-sentence""" - ... 
# LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification @ctypes_function("llama_token_cls", [llama_model_p_ctypes], llama_token) def llama_token_cls(model: llama_model_p, /) -> int: """classification""" - ... # LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator @ctypes_function("llama_token_sep", [llama_model_p_ctypes], llama_token) def llama_token_sep(model: llama_model_p, /) -> int: """sentence separator""" - ... # LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line @ctypes_function("llama_token_nl", [llama_model_p_ctypes], llama_token) def llama_token_nl(model: llama_model_p, /) -> int: """next-line""" - ... # // Returns -1 if unknown, 1 for true or 0 for false. @@ -2677,7 +2667,6 @@ def llama_token_nl(model: llama_model_p, /) -> int: @ctypes_function("llama_add_bos_token", [llama_model_p_ctypes], ctypes.c_int32) def llama_add_bos_token(model: llama_model_p, /) -> int: """Returns -1 if unknown, 1 for true or 0 for false.""" - ... # // Returns -1 if unknown, 1 for true or 0 for false. @@ -2685,7 +2674,6 @@ def llama_add_bos_token(model: llama_model_p, /) -> int: @ctypes_function("llama_add_eos_token", [llama_model_p_ctypes], ctypes.c_int32) def llama_add_eos_token(model: llama_model_p, /) -> int: """Returns -1 if unknown, 1 for true or 0 for false.""" - ... # // Codellama infill tokens @@ -2693,7 +2681,6 @@ def llama_add_eos_token(model: llama_model_p, /) -> int: @ctypes_function("llama_token_prefix", [llama_model_p_ctypes], llama_token) def llama_token_prefix(model: llama_model_p) -> int: """codellama infill tokens""" - ... # LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle @@ -2772,7 +2759,6 @@ def llama_tokenize( Returns the number of tokens on success, no more than n_tokens_max Returns a negative number on failure - the number of tokens that would have been returned """ - ... # // Token Id -> Piece. @@ -2820,7 +2806,6 @@ def llama_token_to_piece( length: The length of the buffer. lstrip: The number of leading spaces to skip. special: If true, special tokens are rendered in the output.""" - ... # /// @details Convert the provided tokens into text (inverse of llama_tokenize()). @@ -2870,7 +2855,6 @@ def llama_detokenize( text_len_max: The length of the buffer. remove_special: Allow to remove BOS and EOS tokens if model is configured to do so. unparse_special: If true, special tokens are rendered in the output.""" - ... # // @@ -2943,7 +2927,6 @@ def llama_grammar_init( /, ) -> llama_grammar_p: """Initialize a grammar from a set of rules.""" - ... # LLAMA_API void llama_grammar_free(struct llama_grammar * grammar); @@ -2954,7 +2937,6 @@ def llama_grammar_init( ) def llama_grammar_free(grammar: llama_grammar_p, /): """Free a grammar.""" - ... # LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar); @@ -2965,7 +2947,6 @@ def llama_grammar_free(grammar: llama_grammar_p, /): ) def llama_grammar_copy(grammar: llama_grammar_p, /) -> llama_grammar_p: """Copy a grammar.""" - ... # /// @details Apply constraints from grammar @@ -2991,7 +2972,6 @@ def llama_grammar_sample( /, ): """Apply constraints from grammar""" - ... # LLAMA_API DEPRECATED(void llama_sample_grammar( @@ -3018,7 +2998,6 @@ def llama_sample_grammar( candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. 
grammar: A grammar object containing the rules and constraints to apply to the generated text. """ - ... # /// @details Accepts the sampled token into the grammar @@ -3038,7 +3017,6 @@ def llama_grammar_accept_token( /, ): """Accepts the sampled token into the grammar""" - ... # // @@ -3055,7 +3033,6 @@ def llama_grammar_accept_token( ) def llama_set_rng_seed(ctx: llama_context_p, seed: Union[ctypes.c_uint32, int], /): """Sets the current rng seed.""" - ... # /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. @@ -3096,7 +3073,6 @@ def llama_sample_repetition_penalties( """Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. """ - ... # /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 @@ -3126,7 +3102,6 @@ def llama_sample_apply_guidance( /, ): """Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806""" - ... # /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. @@ -3146,7 +3121,6 @@ def llama_sample_softmax( /, ): """Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.""" - ... # /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 @@ -3170,7 +3144,6 @@ def llama_sample_top_k( /, ): """Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751""" - ... # /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 @@ -3194,7 +3167,6 @@ def llama_sample_top_p( /, ): """Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751""" - ... # /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 @@ -3218,7 +3190,6 @@ def llama_sample_min_p( /, ): """Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841""" - ... # /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. @@ -3242,7 +3213,6 @@ def llama_sample_tail_free( /, ): """Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.""" - ... # /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. @@ -3266,7 +3236,6 @@ def llama_sample_typical( /, ): """Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.""" - ... # /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772. @@ -3298,7 +3267,6 @@ def llama_sample_entropy( /, ): """Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.""" - ... # LLAMA_API void llama_sample_temp( @@ -3324,7 +3292,6 @@ def llama_sample_temp( candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. 
temp: The temperature value to use for the sampling. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. """ - ... # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. @@ -3372,7 +3339,6 @@ def llama_sample_token_mirostat( m: The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. """ - ... # /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. @@ -3415,7 +3381,6 @@ def llama_sample_token_mirostat_v2( eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. """ - ... # /// @details Selects the token with the highest probability. @@ -3436,7 +3401,6 @@ def llama_sample_token_greedy( /, ) -> int: """Selects the token with the highest probability.""" - ... # /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx. @@ -3456,7 +3420,6 @@ def llama_sample_token( /, ) -> int: """Randomly selects a token from the candidates based on their probabilities.""" - ... # // @@ -3482,7 +3445,6 @@ def llama_split_path( /, ) -> int: """Build a split GGUF final path for this chunk.""" - ... # /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. @@ -3503,7 +3465,6 @@ def llama_split_prefix( /, ) -> int: """Extract the path prefix from the split_path if and only if the split_no and split_count match.""" - ... # Performance information @@ -3517,7 +3478,6 @@ def llama_split_prefix( ) def llama_get_timings(ctx: llama_context_p, /) -> llama_timings: """Get performance information""" - ... # LLAMA_API void llama_print_timings(struct llama_context * ctx); @@ -3528,7 +3488,6 @@ def llama_get_timings(ctx: llama_context_p, /) -> llama_timings: ) def llama_print_timings(ctx: llama_context_p, /): """Print performance information""" - ... # LLAMA_API void llama_reset_timings(struct llama_context * ctx); @@ -3539,7 +3498,6 @@ def llama_print_timings(ctx: llama_context_p, /): ) def llama_reset_timings(ctx: llama_context_p, /): """Reset performance information""" - ... # Print system information @@ -3551,7 +3509,6 @@ def llama_reset_timings(ctx: llama_context_p, /): ) def llama_print_system_info() -> bytes: """Print system information""" - ... # NOTE: THIS IS CURRENTLY BROKEN AS ggml_log_callback IS NOT EXPOSED IN LLAMA.H @@ -3571,7 +3528,6 @@ def llama_log_set( """Set callback for all future logging events. If this is not called, or NULL is supplied, everything is output on stderr.""" - ... 
# LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); From ea5ced2f9942ecb1eb621f0825c26b4b496afcd0 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 20:34:37 +0200 Subject: [PATCH 115/177] add more rules --- .github/workflows/linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 4fc91d690..28451b5a2 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -24,4 +24,4 @@ jobs: cache: 'pip' - uses: chartboost/ruff-action@v1 with: - args: 'check --select A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,DTZ,E,EM,ERA,F,FA,FAST,FBT,FIX,FLY,FURB,G,I,ICN,INP,INT,ISC,LOG,N,NPY,PERF,PGH,PIE,PL,PT,PTH,PYI,Q,R,RET,RSE,RUF,S,SIM,SLF,SLOT,T10,T20,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' + args: 'check --select A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,CPY,DTZ,E,EM,ERA,F,FA,FAST,FBT,FIX,FLY,FURB,G,I,ICN,INP,INT,ISC,LOG,N,NPY,PERF,PGH,PIE,PL,PT,PTH,PYI,Q,R,RET,RSE,RUF,S,SIM,SLF,SLOT,T10,T20,TCH,TD,TID,TRY,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' From a5eec360ad29d3fcf84605004a73611a23c63cfb Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 20:54:17 +0200 Subject: [PATCH 116/177] activation of preview --- .github/workflows/linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 28451b5a2..3226dd1c8 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -24,4 +24,4 @@ jobs: cache: 'pip' - uses: chartboost/ruff-action@v1 with: - args: 'check --select A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,CPY,DTZ,E,EM,ERA,F,FA,FAST,FBT,FIX,FLY,FURB,G,I,ICN,INP,INT,ISC,LOG,N,NPY,PERF,PGH,PIE,PL,PT,PTH,PYI,Q,R,RET,RSE,RUF,S,SIM,SLF,SLOT,T10,T20,TCH,TD,TID,TRY,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' + args: 'check --preview --select A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,CPY,DTZ,E,EM,ERA,F,FA,FAST,FBT,FIX,FLY,FURB,G,I,ICN,INP,INT,ISC,LOG,N,NPY,PERF,PGH,PIE,PL,PT,PTH,PYI,Q,R,RET,RSE,RUF,S,SIM,SLF,SLOT,T10,T20,TCH,TD,TID,TRY,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' From 4b30a1711548eed77e9f1d82a37ac9408464239f Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 20:58:32 +0200 Subject: [PATCH 117/177] Lint --- examples/low_level_api/util.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/low_level_api/util.py b/examples/low_level_api/util.py index 93d664aed..858bfa2be 100644 --- a/examples/low_level_api/util.py +++ b/examples/low_level_api/util.py @@ -52,7 +52,7 @@ def __getitem__(self, val): if self.size < self.maxsize else self.list[(self.offset + val) % self.maxsize] ) - elif isinstance(val, slice): + if isinstance(val, slice): start, stop, step = val.start, val.stop, val.step if step is None: step = 1 @@ -71,8 +71,7 @@ def __getitem__(self, val): for i in indices if i < self.size ] - else: - raise TypeError("Invalid argument type") + raise TypeError("Invalid argument type") if __name__ == "__main__": From f047d4c7030e92f943c71900aa890b1adcf5b74d Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 21:02:02 +0200 Subject: [PATCH 118/177] Lint --- llama_cpp/server/app.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 
87dcf5567..040ebca7b 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -340,8 +340,7 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: sep="\n", ping_message_factory=_ping_message_factory, ) - else: - return iterator_or_completion + return iterator_or_completion @router.post( @@ -533,9 +532,8 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: sep="\n", ping_message_factory=_ping_message_factory, ) - else: - exit_stack.close() - return iterator_or_completion + exit_stack.close() + return iterator_or_completion @router.get( From ac2b64021fb0c2b4f5c88adc063e05d244269b18 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 21:07:35 +0200 Subject: [PATCH 119/177] Lint --- llama_cpp/llama.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index eb8a8a6ab..c29fc76bb 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -996,8 +996,8 @@ def decode_batch(seq_sizes: List[int]): if return_count: return output, total_tokens - else: - return output + + return output def _create_completion( self, @@ -1929,8 +1929,8 @@ def create_chat_completion_openai_v1( assert isinstance(stream, bool) if stream: return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) # type: ignore - else: - return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore + + return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore except ImportError: raise ImportError( "To use create_chat_completion_openai_v1, you must install the openai package." @@ -2070,12 +2070,12 @@ def pooling_type(self) -> str: def close(self) -> None: """Explicitly free the model from memory.""" - if hasattr(self,"_stack"): + if hasattr(self, "_stack"): if self._stack is not None: self._stack.close() def __del__(self) -> None: - if hasattr(self,"_lora_adapter"): + if hasattr(self, "_lora_adapter"): if self._lora_adapter is not None: llama_cpp.llama_lora_adapter_free(self._lora_adapter) self.close() From 06d285ccb8544a3989450d8e60439568a6a73720 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 21:08:40 +0200 Subject: [PATCH 120/177] Lint --- llama_cpp/llava_cpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py index 33e92b2b0..e11334678 100644 --- a/llama_cpp/llava_cpp.py +++ b/llama_cpp/llava_cpp.py @@ -126,8 +126,8 @@ def decorator(f: F) -> F: func.restype = restype functools.wraps(f)(func) return func - else: - return f + + return f return decorator From 1555e2141390fb7d1dc120fc3db4ce806d94b53a Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 21:09:33 +0200 Subject: [PATCH 121/177] Update llama_tokenizer.py --- llama_cpp/llama_tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_tokenizer.py b/llama_cpp/llama_tokenizer.py index ee952566b..ca31cabd2 100644 --- a/llama_cpp/llama_tokenizer.py +++ b/llama_cpp/llama_tokenizer.py @@ -88,8 +88,8 @@ def detokenize( "utf-8", errors="ignore", ) return text[len(prev_text) :] - else: - return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") + + return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore") @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str) -> LlamaHFTokenizer: From 5fdb66c66785fb6ab3d8b63defe841b6b4c513e3 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 
Aug 2024 21:10:59 +0200 Subject: [PATCH 122/177] Lint --- llama_cpp/_internals.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index b58974b8b..b721fb17f 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -691,8 +691,8 @@ def _should_add_bos(model: _LlamaModel) -> bool: add_bos = llama_cpp.llama_add_bos_token(model.model) if add_bos != -1: return add_bos != 0 - else: - return llama_cpp.llama_vocab_type(model.model) == llama_cpp.LLAMA_VOCAB_TYPE_SPM + + return llama_cpp.llama_vocab_type(model.model) == llama_cpp.LLAMA_VOCAB_TYPE_SPM # Embedding functions @@ -762,8 +762,8 @@ def cp(self): def last(self) -> Optional[int]: if len(self.prev) > 0: return self.prev[-1] - else: - return None + + return None def prev_str(self, ctx_main: _LlamaContext, n: int) -> str: return ctx_main.model.detokenize(self.prev[-n:]).decode("utf-8") From 9ceadda29356cee899a781a12997861f79b38e5a Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 21:12:43 +0200 Subject: [PATCH 123/177] Lint --- examples/batch-processing/server.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/batch-processing/server.py b/examples/batch-processing/server.py index a7e5c8c38..067e1f271 100644 --- a/examples/batch-processing/server.py +++ b/examples/batch-processing/server.py @@ -24,7 +24,6 @@ app = FastAPI() - @app.post("/v1/chat/completions") def create_chat_completions(): return {"message": "Hello World"} From ed2f893aaa8d32c58faa4d11d681f118d2f6ed92 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 21:14:01 +0200 Subject: [PATCH 124/177] Lint --- llama_cpp/llama_cpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 8c6a51bfd..049deed10 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -128,8 +128,8 @@ def decorator(f: F) -> F: func.restype = restype functools.wraps(f)(func) return func - else: - return f + + return f return decorator From ac25612c7f0aac06348612b5931d8d97f14e84e2 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 21:25:54 +0200 Subject: [PATCH 125/177] Update linter.yml --- .github/workflows/linter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 3226dd1c8..28451b5a2 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -24,4 +24,4 @@ jobs: cache: 'pip' - uses: chartboost/ruff-action@v1 with: - args: 'check --preview --select A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,CPY,DTZ,E,EM,ERA,F,FA,FAST,FBT,FIX,FLY,FURB,G,I,ICN,INP,INT,ISC,LOG,N,NPY,PERF,PGH,PIE,PL,PT,PTH,PYI,Q,R,RET,RSE,RUF,S,SIM,SLF,SLOT,T10,T20,TCH,TD,TID,TRY,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' + args: 'check --select A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,CPY,DTZ,E,EM,ERA,F,FA,FAST,FBT,FIX,FLY,FURB,G,I,ICN,INP,INT,ISC,LOG,N,NPY,PERF,PGH,PIE,PL,PT,PTH,PYI,Q,R,RET,RSE,RUF,S,SIM,SLF,SLOT,T10,T20,TCH,TD,TID,TRY,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' From 5bdad76b1bb2a9bd2c7426873222744796c976af Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 21:27:14 +0200 Subject: [PATCH 126/177] Lint --- llama_cpp/llava_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py index e11334678..cf09005d6 100644 --- a/llama_cpp/llava_cpp.py +++ 
b/llama_cpp/llava_cpp.py @@ -126,7 +126,7 @@ def decorator(f: F) -> F: func.restype = restype functools.wraps(f)(func) return func - + return f return decorator From ecd789073db9b639e056fcd3931dc0d30a5049e4 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Sat, 3 Aug 2024 21:28:27 +0200 Subject: [PATCH 127/177] Lint --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index c29fc76bb..5d2a0ca63 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -996,7 +996,7 @@ def decode_batch(seq_sizes: List[int]): if return_count: return output, total_tokens - + return output def _create_completion( From 19d5afee02975e94c22bd1a6a27613771f89e35b Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 06:06:08 +0200 Subject: [PATCH 128/177] Linter --- llama_cpp/llama_cpp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 7d4c5c3c9..7d1fda3f1 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1493,7 +1493,6 @@ def llama_model_has_encoder(model: llama_model_p, /) -> bool: @ctypes_function("llama_model_has_decoder", [llama_model_p_ctypes], ctypes.c_bool) def llama_model_has_decoder(model: llama_model_p, /) -> bool: """Returns true if the model contains a decoder that requires llama_decode() call""" - ... # // For encoder-decoder models, this function returns id of the token that must be provided From 52200d5d052235cbee65371e822a828d6735b4c7 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 06:07:32 +0200 Subject: [PATCH 129/177] Lint --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 8bbd1fef1..81e81ac71 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -775,7 +775,7 @@ def generate( self.n_tokens = longest_prefix if self.verbose: print(f"Llama.generate: {longest_prefix} prefix-match hit, " - f"remaining {len(tokens)} prompt tokens to eval", file=sys.stderr) + f"remaining {len(tokens)} prompt tokens to eval", file=sys.stderr) # Reset the model state if reset: From 84532c28984e731402287a9df41b6869371f9eca Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 06:14:13 +0200 Subject: [PATCH 130/177] Create ci.yml --- .github/workflows/ci.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000..ae3dcf44b --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,21 @@ +name: pre-commit + +on: + pull_request: + push: + branches: [master] + +jobs: + pre-commit: + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + - uses: actions/setup-python@v5 + - uses: pre-commit/action@v3.0.1 + - uses: stefanzweifel/git-auto-commit-action@v5 + with: + commit_message: 'pre commit fixes' From a5e7f87e9dac255fbc200460ac4f4cdfde3eb61b Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 06:15:40 +0200 Subject: [PATCH 131/177] Create .pre-commit-config.yaml --- .pre-commit-config.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..66b4f94eb --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,19 @@ +repos: + # auto update + - 
repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update + rev: "v0.3.3post1" + hooks: + - id: pre-commit-update + args: [--dry-run, --all-versions] + + # ruff + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: "v0.5.7" + hooks: + # Run the linter. + - id: ruff + types_or: [ python, pyi, jupyter ] + args: [ --fix ] + # Run the formatter. + - id: ruff-format + types_or: [ python, pyi, jupyter ] From 16df7d62bca0422520b5288347083a2cb78678d0 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 06:24:05 +0200 Subject: [PATCH 132/177] Create ruff.toml --- ruff.toml | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 ruff.toml diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 000000000..b3a022a30 --- /dev/null +++ b/ruff.toml @@ -0,0 +1,127 @@ +# Exclude a variety of commonly ignored directories. +exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".git-rewrite", + ".hg", + ".ipynb", + ".ipynb_checkpoints", + ".mypy_cache", + ".nox", + ".pants.d", + ".pyenv", + ".pytest_cache", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + ".vscode", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "site-packages", + "venv", +] + +# Same as Black. +line-length = 120 +indent-width = 4 + +# Assume Python 3.12 +target-version = "py312" + +[lint] +preview = true +explicit-preview-rules = true +# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. +select = [ + "A", # flake8-builtins + "AIR", # Airflow + "ANN", # flake8-annotations + "ARG", # flake8-unused-arguments + "ASYNC", # flake8-async + "B", # flake8-bugbear + "BLE", # flake8-blind-except + "C4", # flake8-comprehensions + "C90", # McCabe cyclomatic complexity + "COM", # flake8-commas + "CPY", # flake8-copyright + # "D", # pydocstyle + # "DJ", # flake8-django + # "DOC", # pydoclint + "DTZ", # flake8-datetimez + "E", # pycodestyle + "EM", # flake8-errmsg + "ERA", # eradicate + # "EXE", # flake8-executable + "F", # Pyflakes + "FA", # flake8-future-annotations + "FAST", # FastAPI + "FBT", # flake8-boolean-trap + "FIX", # flake8-pp + "FLY", # flynt + "FURB", # refurb + "G", # flake8-logging-format + "I", # isort + "ICN", # flake8-import-conventions + "INP", # flake8-no-pep420 + "INT", # flake8-gettext + "ISC", # flake8-implicit-str-concat + "LOG", # flake8-logging + "N", # pep8-naming + "NPY", # NumPy-specific rules + # "PD", # pandas-vet + "PERF", # Perflint + "PGH", # pygrep-hooks + "PIE", # flake8-pie + "PL", # Pylint + "PT", # flake8-pytest-style + "PTH", # flake8-use-pathlib + "PYI", # flake8-pyi + "Q", # flake8-quotes + "R", # Refactor + "RET", # flake8-return + "RSE", # flake8-raise + "RUF", # Ruff-specific rules + "S", # flake8-bandit + "SIM", # flake8-simplify + "SLF", # flake8-self + "SLOT", # flake8-slots + "T10", # flake8-debugger + "T20", # flake8-print + "TCH", # flake8-type-checking + "TD", # flake8-todos + "TID", # flake8-tidy-imports + "TRY", # tryceratops + "UP", # pyupgrade + "W", # pycodestyle + "YTT", # flake8-2020 +] +ignore = ["COM812","E501","F401","ISC001",] + +# Allow fix for all enabled rules (when `--fix`) is provided. +fixable = ["ALL"] +unfixable = [] + +# Allow unused variables when underscore-prefixed. +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" + +[format] +preview = true +# Like Black, use double quotes for strings. +quote-style = "double" + +# Like Black, indent with spaces, rather than tabs. 
+indent-style = "space" + +# Like Black, respect magic trailing commas. +skip-magic-trailing-comma = false + +# Like Black, automatically detect the appropriate line ending. +line-ending = "auto" From cb0484349ae5bce8b94a33780c1c6c449335ec84 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 06:27:22 +0200 Subject: [PATCH 133/177] Update pyproject.toml --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 7aebb74f5..10d40b6c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,8 @@ dev = [ "mkdocs-material>=9.1.18", "pytest>=7.4.0", "httpx>=0.24.1", + "pre-commit>=3.8.0", + "ruff>=0.5.7", ] all = [ "llama_cpp_python[server,test,dev]", From c1d61addf8918c6f726d089ba22a76b34e0ef492 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 06:29:32 +0200 Subject: [PATCH 134/177] Update ci.yml --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ae3dcf44b..aefdf2bf7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,7 +3,7 @@ name: pre-commit on: pull_request: push: - branches: [master] + branches: [main] jobs: pre-commit: From a3a43d5b58a966b318a11b935e281b7b61a76f84 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 06:32:26 +0200 Subject: [PATCH 135/177] Update ci.yml --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index aefdf2bf7..89752b45f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,4 +1,4 @@ -name: pre-commit +name: ci on: pull_request: From 68dd94061cc2df938e261ca79f1fc12aa4e22b74 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 06:39:02 +0200 Subject: [PATCH 136/177] Update ci.yml --- .github/workflows/ci.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 89752b45f..9ea20317a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,11 +11,11 @@ jobs: permissions: contents: write steps: - - uses: actions/checkout@v4 - with: - ref: ${{ github.head_ref }} - - uses: actions/setup-python@v5 - - uses: pre-commit/action@v3.0.1 - - uses: stefanzweifel/git-auto-commit-action@v5 - with: - commit_message: 'pre commit fixes' + - uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + - uses: actions/setup-python@v5 + - uses: pre-commit/action@v3.0.1 + - uses: stefanzweifel/git-auto-commit-action@v5 + with: + commit_message: 'pre commit fixes' From f03b2230687ba545b48d6ad8b9cf11c836247214 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 06:43:37 +0200 Subject: [PATCH 137/177] Update ci.yml --- .github/workflows/ci.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9ea20317a..ccf548723 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,10 +12,10 @@ jobs: contents: write steps: - uses: actions/checkout@v4 - with: - ref: ${{ github.head_ref }} + # with: + # ref: ${{ github.head_ref }} - uses: actions/setup-python@v5 - uses: pre-commit/action@v3.0.1 - - uses: stefanzweifel/git-auto-commit-action@v5 - with: - commit_message: 'pre commit fixes' + # - uses: stefanzweifel/git-auto-commit-action@v5 + # with: + # commit_message: 'pre commit fixes' From 
a3b5956bd11976da4949f3ce22849498b26ff750 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 06:45:58 +0200 Subject: [PATCH 138/177] Delete .github/workflows/linter.yml --- .github/workflows/linter.yml | 27 --------------------------- 1 file changed, 27 deletions(-) delete mode 100644 .github/workflows/linter.yml diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml deleted file mode 100644 index 28451b5a2..000000000 --- a/.github/workflows/linter.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: Ruff - -on: - push: - branches: [ "main" ] - pull_request: - # The branches below must be a subset of the branches above - branches: [ "main" ] - schedule: - - cron: '30 3 * * *' - -permissions: - contents: read - -jobs: - ruff: - name: Ruff - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: '3.12' - cache: 'pip' - - uses: chartboost/ruff-action@v1 - with: - args: 'check --select A,AIR,ANN,ARG,ASYNC,B,BLE,C4,C90,COM,CPY,DTZ,E,EM,ERA,F,FA,FAST,FBT,FIX,FLY,FURB,G,I,ICN,INP,INT,ISC,LOG,N,NPY,PERF,PGH,PIE,PL,PT,PTH,PYI,Q,R,RET,RSE,RUF,S,SIM,SLF,SLOT,T10,T20,TCH,TD,TID,TRY,UP,W,YTT --output-format github --diff --exclude docker,docs,scripts,tests,vendor' From b32fd7d86e03761722203c92995d27a444064d92 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 06:48:38 +0200 Subject: [PATCH 139/177] Update .pre-commit-config.yaml --- .pre-commit-config.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 66b4f94eb..115751835 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,3 +17,9 @@ repos: # Run the formatter. - id: ruff-format types_or: [ python, pyi, jupyter ] + + - repo: https://github.com/pre-commit/mirrors-mypy + rev: "v1.11.1" + hooks: + - id: mypy + args: [ '--ignore-missing-imports', '--disable-error-code=top-level-await', "--disable-error-code=empty-body" ] From 2dc7394825aa11f85555ce2874fbc22f029c2210 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 06:54:07 +0200 Subject: [PATCH 140/177] exclude rule T201 --- ruff.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ruff.toml b/ruff.toml index b3a022a30..8dd9e76d0 100644 --- a/ruff.toml +++ b/ruff.toml @@ -103,7 +103,7 @@ select = [ "W", # pycodestyle "YTT", # flake8-2020 ] -ignore = ["COM812","E501","F401","ISC001",] +ignore = ["COM812","E501","F401","ISC001","T201"] # Allow fix for all enabled rules (when `--fix`) is provided. fixable = ["ALL"] From 49b543494e3655a4e6653e5ac8f9aafbeba84e62 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 06:56:08 +0200 Subject: [PATCH 141/177] exclude rule ERA001 --- ruff.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ruff.toml b/ruff.toml index 8dd9e76d0..5c89e6598 100644 --- a/ruff.toml +++ b/ruff.toml @@ -103,7 +103,7 @@ select = [ "W", # pycodestyle "YTT", # flake8-2020 ] -ignore = ["COM812","E501","F401","ISC001","T201"] +ignore = ["COM812","E501","ERA001","F401","ISC001","T201"] # Allow fix for all enabled rules (when `--fix`) is provided. 
fixable = ["ALL"] From 772c5b30884784f109f6afb835309529fd7b46e2 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 06:59:40 +0200 Subject: [PATCH 142/177] Update ruff.toml --- ruff.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ruff.toml b/ruff.toml index 5c89e6598..6eae71b6f 100644 --- a/ruff.toml +++ b/ruff.toml @@ -103,7 +103,7 @@ select = [ "W", # pycodestyle "YTT", # flake8-2020 ] -ignore = ["COM812","E501","ERA001","F401","ISC001","T201"] +ignore = ["A001","A002","ANN001","ANN202","COM812","E501","ERA001","F401","ISC001","T201"] # Allow fix for all enabled rules (when `--fix`) is provided. fixable = ["ALL"] From d04fab6fa2cb47851c63ca4774469f3ecd2f5955 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 07:01:46 +0200 Subject: [PATCH 143/177] Update ruff.toml --- ruff.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ruff.toml b/ruff.toml index 6eae71b6f..de9934d7e 100644 --- a/ruff.toml +++ b/ruff.toml @@ -103,7 +103,7 @@ select = [ "W", # pycodestyle "YTT", # flake8-2020 ] -ignore = ["A001","A002","ANN001","ANN202","COM812","E501","ERA001","F401","ISC001","T201"] +ignore = ["A001","A002","ANN001","ANN201","ANN202","COM812","E501","ERA001","F401","ISC001","T201"] # Allow fix for all enabled rules (when `--fix`) is provided. fixable = ["ALL"] From d0285438caa7e5a6a854deea434edc682b4fd60e Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 07:15:09 +0200 Subject: [PATCH 144/177] Create fixer.yml --- .github/workflows/fixer.yml | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 .github/workflows/fixer.yml diff --git a/.github/workflows/fixer.yml b/.github/workflows/fixer.yml new file mode 100644 index 000000000..3c451f05a --- /dev/null +++ b/.github/workflows/fixer.yml @@ -0,0 +1,34 @@ +name: Fixer + +on: [push, pull_request] + +concurrency: + group: fixer-${{ github.event_name == 'pull_request' && format('{0}-{1}', github.workflow, github.event.pull_request.number) || github.workflow_ref }} + cancel-in-progress: true + +jobs: + ruff-lint: + name: Ruff + runs-on: ubuntu-latest + permissions: + # Give the default GITHUB_TOKEN write permission to commit and push the + # added or changed files to the repository. 
+ contents: write + + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' + + - uses: chartboost/ruff-action@v1 + with: + args: 'check --preview --fix-only' + + - uses: stefanzweifel/git-auto-commit-action@v5 + with: + commit_message: 'style fixes by ruff' From e1e861e8a84baa47221a2dca69799bd9185a7c5b Mon Sep 17 00:00:00 2001 From: Smartappli Date: Thu, 15 Aug 2024 05:15:25 +0000 Subject: [PATCH 145/177] style fixes by ruff --- docker/open_llama/hug_model.py | 60 +- .../high_level_api/langchain_custom_llm.py | 5 +- examples/low_level_api/common.py | 2 +- examples/notebooks/Functions.ipynb | 23 +- .../notebooks/OpenHermesFunctionCalling.ipynb | 11 +- examples/notebooks/PerformanceTuning.ipynb | 9 +- examples/ray/llm.py | 4 +- llama_cpp/_internals.py | 20 +- llama_cpp/_utils.py | 2 +- llama_cpp/llama.py | 245 +++-- llama_cpp/llama_cache.py | 18 +- llama_cpp/llama_chat_format.py | 960 +++++++++--------- llama_cpp/llama_cpp.py | 283 +++--- llama_cpp/llama_tokenizer.py | 6 +- llama_cpp/llama_types.py | 72 +- llama_cpp/llava_cpp.py | 25 +- llama_cpp/server/app.py | 32 +- llama_cpp/server/cli.py | 26 +- llama_cpp/server/errors.py | 39 +- llama_cpp/server/model.py | 14 +- llama_cpp/server/settings.py | 43 +- llama_cpp/server/types.py | 92 +- tests/test_llama.py | 10 +- tests/test_llama_chat_format.py | 6 +- tests/test_llama_grammar.py | 5 +- tests/test_llama_speculative.py | 1 + 26 files changed, 944 insertions(+), 1069 deletions(-) diff --git a/docker/open_llama/hug_model.py b/docker/open_llama/hug_model.py index 13c5b6b0d..ee23821a6 100644 --- a/docker/open_llama/hug_model.py +++ b/docker/open_llama/hug_model.py @@ -1,26 +1,27 @@ -import requests +import argparse import json import os import struct -import argparse + +import requests + def make_request(url, params=None): print(f"Making request to {url}...") response = requests.get(url, params=params) if response.status_code == 200: return json.loads(response.text) - else: - print(f"Request failed with status code {response.status_code}") - return None + print(f"Request failed with status code {response.status_code}") + return None def check_magic_and_version(filename): - with open(filename, 'rb') as f: + with open(filename, "rb") as f: # Read the first 6 bytes from the file data = f.read(6) # Unpack the binary data, interpreting the first 4 bytes as a little-endian unsigned int # and the next 2 bytes as a little-endian unsigned short - magic, version = struct.unpack('= 10485760: # 10 MB - print('.', end='', flush=True) + print(".", end="", flush=True) total_downloaded = 0 print("\nDownload complete.") - + # Creating a symbolic link from destination to "model.bin" if os.path.isfile("model.bin"): os.remove("model.bin") # remove the existing link if any @@ -61,30 +62,29 @@ def get_user_choice(model_list): if 0 <= index < len(model_list): # Return the chosen model return model_list[index] - else: - print("Invalid choice.") + print("Invalid choice.") except ValueError: print("Invalid input. Please enter a number corresponding to a model.") except IndexError: print("Invalid choice. 
Index out of range.") - + return None def main(): # Create an argument parser - parser = argparse.ArgumentParser(description='Process some parameters.') + parser = argparse.ArgumentParser(description="Process some parameters.") # Arguments - parser.add_argument('-v', '--version', type=int, default=0x0003, - help='hexadecimal version number of ggml file') - parser.add_argument('-a', '--author', type=str, default='TheBloke', - help='HuggingFace author filter') - parser.add_argument('-t', '--tag', type=str, default='llama', - help='HuggingFace tag filter') - parser.add_argument('-s', '--search', type=str, default='', - help='HuggingFace search filter') - parser.add_argument('-f', '--filename', type=str, default='q5_1', - help='HuggingFace model repository filename substring match') + parser.add_argument("-v", "--version", type=int, default=0x0003, + help="hexadecimal version number of ggml file") + parser.add_argument("-a", "--author", type=str, default="TheBloke", + help="HuggingFace author filter") + parser.add_argument("-t", "--tag", type=str, default="llama", + help="HuggingFace tag filter") + parser.add_argument("-s", "--search", type=str, default="", + help="HuggingFace search filter") + parser.add_argument("-f", "--filename", type=str, default="q5_1", + help="HuggingFace model repository filename substring match") # Parse the arguments args = parser.parse_args() @@ -96,20 +96,20 @@ def main(): "search": args.search } - models = make_request('https://huggingface.co/api/models', params=params) + models = make_request("https://huggingface.co/api/models", params=params) if models is None: return model_list = [] # Iterate over the models for model in models: - model_id = model['id'] - model_info = make_request(f'https://huggingface.co/api/models/{model_id}') + model_id = model["id"] + model_info = make_request(f"https://huggingface.co/api/models/{model_id}") if model_info is None: continue - for sibling in model_info.get('siblings', []): - rfilename = sibling.get('rfilename') + for sibling in model_info.get("siblings", []): + rfilename = sibling.get("rfilename") if rfilename and args.filename in rfilename: model_list.append((model_id, rfilename)) @@ -135,5 +135,5 @@ def main(): print("Error - model choice was None") exit(2) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/high_level_api/langchain_custom_llm.py b/examples/high_level_api/langchain_custom_llm.py index adbae6ce4..091cbbb69 100644 --- a/examples/high_level_api/langchain_custom_llm.py +++ b/examples/high_level_api/langchain_custom_llm.py @@ -1,5 +1,6 @@ import argparse -from typing import Any, List, Mapping, Optional +from collections.abc import Mapping +from typing import Any, List, Optional from langchain.llms.base import LLM @@ -19,7 +20,7 @@ def __init__(self, model_path: str, **kwargs: Any): llm = Llama(model_path=model_path) super().__init__(model_path=model_path, llm=llm, **kwargs) - def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str: + def _call(self, prompt: str, stop: list[str] | None = None) -> str: response = self.llm(prompt, stop=stop or []) return response["choices"][0]["text"] diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py index b924ad13f..7ec141f2a 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -37,7 +37,7 @@ class GptParams: path_session: str = "" input_prefix: str = " " input_suffix: str = "" - antiprompt: List[str] = field(default_factory=list) + antiprompt: list[str] = 
field(default_factory=list) lora_adapter: str = "" lora_base: str = "" diff --git a/examples/notebooks/Functions.ipynb b/examples/notebooks/Functions.ipynb index 1f4138165..12438d900 100644 --- a/examples/notebooks/Functions.ipynb +++ b/examples/notebooks/Functions.ipynb @@ -40,9 +40,9 @@ } ], "source": [ - "import openai\n", "import json\n", "\n", + "import openai\n", "\n", "client = openai.OpenAI(\n", " api_key=\"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\", # can be anything\n", @@ -56,14 +56,13 @@ " \"\"\"Get the current weather in a given location\"\"\"\n", " if \"tokyo\" in location.lower():\n", " return json.dumps({\"location\": \"Tokyo\", \"temperature\": \"10\", \"unit\": \"celsius\"})\n", - " elif \"san francisco\" in location.lower():\n", + " if \"san francisco\" in location.lower():\n", " return json.dumps(\n", " {\"location\": \"San Francisco\", \"temperature\": \"72\", \"unit\": \"fahrenheit\"}\n", " )\n", - " elif \"paris\" in location.lower():\n", + " if \"paris\" in location.lower():\n", " return json.dumps({\"location\": \"Paris\", \"temperature\": \"22\", \"unit\": \"celsius\"})\n", - " else:\n", - " return json.dumps({\"location\": location, \"temperature\": \"unknown\"})\n", + " return json.dumps({\"location\": location, \"temperature\": \"unknown\"})\n", "\n", "\n", "def run_conversation():\n", @@ -298,7 +297,7 @@ " Class for a multi-class label prediction.\n", " \"\"\"\n", "\n", - " class_labels: List[MultiLabels]\n", + " class_labels: list[MultiLabels]\n", "\n", "\n", "def multi_classify(data: str) -> MultiClassPrediction:\n", @@ -346,11 +345,10 @@ } ], "source": [ - "from typing_extensions import Annotated\n", - "from pydantic import BaseModel, BeforeValidator\n", + "from typing import Annotated\n", "\n", "from instructor import llm_validator\n", - "\n", + "from pydantic import BaseModel, BeforeValidator\n", "\n", "question = \"What is the meaning of life?\"\n", "context = \"The according to the devil the meaning of live is to live a life of sin and debauchery.\"\n", @@ -429,14 +427,13 @@ ], "source": [ "import re\n", - "from typing import List\n", "\n", - "from pydantic import Field, BaseModel, model_validator, FieldValidationInfo\n", + "from pydantic import BaseModel, Field, FieldValidationInfo, model_validator\n", "\n", "\n", "class Fact(BaseModel):\n", " fact: str = Field(...)\n", - " substring_quote: List[str] = Field(...)\n", + " substring_quote: list[str] = Field(...)\n", "\n", " @model_validator(mode=\"after\")\n", " def validate_sources(self, info: FieldValidationInfo) -> \"Fact\":\n", @@ -456,7 +453,7 @@ "\n", "class QuestionAnswer(BaseModel):\n", " question: str = Field(...)\n", - " answer: List[Fact] = Field(...)\n", + " answer: list[Fact] = Field(...)\n", "\n", " @model_validator(mode=\"after\")\n", " def validate_sources(self) -> \"QuestionAnswer\":\n", diff --git a/examples/notebooks/OpenHermesFunctionCalling.ipynb b/examples/notebooks/OpenHermesFunctionCalling.ipynb index 13128be04..e4d9366aa 100644 --- a/examples/notebooks/OpenHermesFunctionCalling.ipynb +++ b/examples/notebooks/OpenHermesFunctionCalling.ipynb @@ -38,8 +38,8 @@ } ], "source": [ - "import json\n", "import inspect\n", + "import json\n", "from typing import get_type_hints\n", "\n", "\n", @@ -61,7 +61,6 @@ " \"\"\"Get the monthly mortgage payment given an interest rate percentage.\"\"\"\n", "\n", " # TODO: you must implement this to actually call it later\n", - " pass\n", "\n", "\n", "def get_article_details(\n", @@ -75,14 +74,12 @@ " date_published: formatted as \"MM/DD/YYYY\"'''\n", 
"\n", " # TODO: you must implement this to actually call it later\n", - " pass\n", "\n", "\n", "def get_weather(zip_code: str) -> Weather:\n", " \"\"\"Get the current weather given a zip code.\"\"\"\n", "\n", " # TODO: you must implement this to actually call it later\n", - " pass\n", "\n", "\n", "def get_directions(start: str, destination: str) -> Directions:\n", @@ -91,15 +88,13 @@ " destination: end address as a string including zipcode (if any)\"\"\"\n", "\n", " # TODO: you must implement this to actually call it later\n", - " pass\n", "\n", "\n", "def get_type_name(t):\n", " name = str(t)\n", " if \"list\" in name or \"dict\" in name:\n", " return name\n", - " else:\n", - " return t.__name__\n", + " return t.__name__\n", "\n", "\n", "def serialize_function_to_json(func):\n", @@ -129,8 +124,8 @@ "metadata": {}, "outputs": [], "source": [ - "import xml.etree.ElementTree as ET\n", "import re\n", + "import xml.etree.ElementTree as ET\n", "\n", "\n", "def extract_function_calls(completion):\n", diff --git a/examples/notebooks/PerformanceTuning.ipynb b/examples/notebooks/PerformanceTuning.ipynb index ba74e4a41..04c1fb1d2 100644 --- a/examples/notebooks/PerformanceTuning.ipynb +++ b/examples/notebooks/PerformanceTuning.ipynb @@ -6,18 +6,17 @@ "metadata": {}, "outputs": [], "source": [ - "import time\n", "import json\n", "import multiprocessing\n", - "\n", - "import llama_cpp\n", + "import time\n", "\n", "import numpy as np\n", "\n", - "np.int = int\n", + "import llama_cpp\n", "\n", - "from skopt.space import Integer, Categorical\n", + "int = int\n", "\n", + "from skopt.space import Categorical, Integer\n", "\n", "MODEL_PATH = \"../models/ggml-model.bin\"\n", "\n", diff --git a/examples/ray/llm.py b/examples/ray/llm.py index 7900571d1..21b3a4ffd 100755 --- a/examples/ray/llm.py +++ b/examples/ray/llm.py @@ -12,12 +12,12 @@ class LlamaDeployment: def __init__(self, model_path: str): self._llm = Llama(model_path=model_path) - async def __call__(self, http_request: Request) -> Dict: + async def __call__(self, http_request: Request) -> dict: input_json = await http_request.json() prompt = input_json["prompt"] max_tokens = input_json.get("max_tokens", 64) return self._llm(prompt, max_tokens=max_tokens) -def llm_builder(args: Dict[str, str]) -> Application: +def llm_builder(args: dict[str, str]) -> Application: return LlamaDeployment.bind(args["model_path"]) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 5f6b2d231..222fcfd36 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -2,13 +2,13 @@ import ctypes import os +from collections.abc import Sequence from contextlib import ExitStack from dataclasses import dataclass, field from typing import ( Dict, List, Optional, - Sequence, ) import numpy as np @@ -108,7 +108,7 @@ def apply_lora_from_file( self, lora_path: str, scale: float, - path_base_model: Optional[str], + path_base_model: str | None, n_threads: int, ): assert self.model is not None @@ -212,7 +212,7 @@ def token_to_piece(self, token: int, special: bool = False) -> bytes: llama_cpp.llama_token_to_piece(self.model, token, buf, 32, 0, special) return bytes(buf) - def detokenize(self, tokens: List[int], special: bool = False) -> bytes: + def detokenize(self, tokens: list[int], special: bool = False) -> bytes: assert self.model is not None output = b"" size = 32 @@ -232,9 +232,9 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes: ) # Extra - def metadata(self) -> Dict[str, str]: + def metadata(self) -> dict[str, str]: assert 
self.model is not None - metadata: Dict[str, str] = {} + metadata: dict[str, str] = {} buffer_size = 1024 buffer = ctypes.create_string_buffer(buffer_size) # zero the buffer @@ -665,7 +665,7 @@ def _token_to_piece(model: _LlamaModel, token: int, special: bool = False) -> st return bytes(result).decode("utf-8") -def _detokenize_spm(model: _LlamaModel, tokens: List[int]) -> str: +def _detokenize_spm(model: _LlamaModel, tokens: list[int]) -> str: bos_id = model.token_bos() result = "" for i, token in enumerate(tokens): @@ -678,7 +678,7 @@ def _detokenize_spm(model: _LlamaModel, tokens: List[int]) -> str: return result -def _detokenize_bpe(model: _LlamaModel, tokens: List[int]) -> str: +def _detokenize_bpe(model: _LlamaModel, tokens: list[int]) -> str: result = "" for token in tokens: piece = _token_to_piece(model, token) @@ -739,7 +739,7 @@ class _LlamaSamplingParams: class _LlamaSamplingContext: params: _LlamaSamplingParams = field(default_factory=_LlamaSamplingParams) mirostat_mu: ctypes.c_float = field(default_factory=ctypes.c_float) - grammar: Optional[LlamaGrammar] = None + grammar: LlamaGrammar | None = None # NOTE: Missing parsed_grammar prev: list[int] = field(default_factory=list) cur: list[llama_cpp.llama_token_data] = field(default_factory=list) @@ -759,7 +759,7 @@ def cp(self): cur=self.cur.copy(), ) - def last(self) -> Optional[int]: + def last(self) -> int | None: if len(self.prev) > 0: return self.prev[-1] @@ -772,7 +772,7 @@ def sample( self, ctx_main: _LlamaContext, idx: int = 0, - logits_array: Optional[npt.NDArray[np.single]] = None, + logits_array: npt.NDArray[np.single] | None = None, ): n_vocab = ctx_main.model.n_vocab() id: int = 0 diff --git a/llama_cpp/_utils.py b/llama_cpp/_utils.py index 729672f12..0e1d6e78e 100644 --- a/llama_cpp/_utils.py +++ b/llama_cpp/_utils.py @@ -60,7 +60,7 @@ class MetaSingleton(type): Metaclass for implementing the Singleton pattern. 
""" - _instances: Dict[type, Any] = {} + _instances: dict[type, Any] = {} def __call__(cls, *args: Any, **kwargs: Any) -> Any: if cls not in cls._instances: diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 81e81ac71..6ddf5b7c7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -12,18 +12,15 @@ import uuid import warnings from collections import deque +from collections.abc import Callable, Generator, Iterator, Sequence from pathlib import Path from typing import ( Any, - Callable, Deque, Dict, - Generator, - Iterator, List, Literal, Optional, - Sequence, Union, ) @@ -65,21 +62,19 @@ def __init__( n_gpu_layers: int = 0, split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER, main_gpu: int = 0, - tensor_split: Optional[List[float]] = None, - rpc_servers: Optional[str] = None, + tensor_split: list[float] | None = None, + rpc_servers: str | None = None, vocab_only: bool = False, use_mmap: bool = True, use_mlock: bool = False, - kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None, + kv_overrides: dict[str, bool | int | float | str] | None = None, # Context Params seed: int = llama_cpp.LLAMA_DEFAULT_SEED, n_ctx: int = 512, n_batch: int = 512, - n_threads: Optional[int] = None, - n_threads_batch: Optional[int] = None, - rope_scaling_type: Optional[ - int - ] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, + n_threads: int | None = None, + n_threads_batch: int | None = None, + rope_scaling_type: int | None = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED, rope_freq_base: float = 0.0, rope_freq_scale: float = 0.0, @@ -95,21 +90,21 @@ def __init__( # Sampling Params last_n_tokens_size: int = 64, # LoRA Params - lora_base: Optional[str] = None, + lora_base: str | None = None, lora_scale: float = 1.0, - lora_path: Optional[str] = None, + lora_path: str | None = None, # Backend Params - numa: Union[bool, int] = False, + numa: bool | int = False, # Chat Format Params - chat_format: Optional[str] = None, - chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None, + chat_format: str | None = None, + chat_handler: llama_chat_format.LlamaChatCompletionHandler | None = None, # Speculative Decoding - draft_model: Optional[LlamaDraftModel] = None, + draft_model: LlamaDraftModel | None = None, # Tokenizer Override - tokenizer: Optional[BaseLlamaTokenizer] = None, + tokenizer: BaseLlamaTokenizer | None = None, # KV cache quantization - type_k: Optional[int] = None, - type_v: Optional[int] = None, + type_k: int | None = None, + type_v: int | None = None, # Misc spm_infill: bool = False, verbose: bool = True, @@ -347,7 +342,7 @@ def __init__( # Sampling Params self.last_n_tokens_size = last_n_tokens_size - self.cache: Optional[BaseLlamaCache] = None + self.cache: BaseLlamaCache | None = None self.lora_base = lora_base self.lora_scale = lora_scale @@ -401,7 +396,7 @@ def __init__( ), ) - self._lora_adapter: Optional[llama_cpp.llama_lora_adapter_p] = None + self._lora_adapter: llama_cpp.llama_lora_adapter_p | None = None if self.lora_path: assert self._model.model is not None @@ -426,7 +421,7 @@ def __init__( self.chat_format = chat_format self.chat_handler = chat_handler - self._chat_handlers: Dict[str, llama_chat_format.LlamaChatCompletionHandler] = ( + self._chat_handlers: dict[str, llama_chat_format.LlamaChatCompletionHandler] = ( {} ) @@ -546,11 +541,11 @@ def _scores(self) -> npt.NDArray[np.single]: return self.scores[: self.n_tokens, :] @property - def eval_tokens(self) -> Deque[int]: + def 
eval_tokens(self) -> deque[int]: return deque(self.input_ids[: self.n_tokens].tolist(), maxlen=self._n_ctx) @property - def eval_logits(self) -> Deque[List[float]]: + def eval_logits(self) -> deque[list[float]]: return deque( self.scores[: self.n_tokens, :].tolist(), maxlen=self._n_ctx if self.context_params.logits_all else 1, @@ -558,7 +553,7 @@ def eval_logits(self) -> Deque[List[float]]: def tokenize( self, text: bytes, add_bos: bool = True, special: bool = False, - ) -> List[int]: + ) -> list[int]: """Tokenize a string. Args: @@ -573,7 +568,7 @@ def tokenize( return self.tokenizer_.tokenize(text, add_bos, special) def detokenize( - self, tokens: List[int], prev_tokens: Optional[List[int]] = None, + self, tokens: list[int], prev_tokens: list[int] | None = None, ) -> bytes: """Detokenize a list of tokens. @@ -586,7 +581,7 @@ def detokenize( """ return self.tokenizer_.detokenize(tokens, prev_tokens=prev_tokens) - def set_cache(self, cache: Optional[BaseLlamaCache]): + def set_cache(self, cache: BaseLlamaCache | None): """Set the cache. Args: @@ -659,9 +654,9 @@ def sample( mirostat_eta: float = 0.1, mirostat_tau: float = 5.0, penalize_nl: bool = True, - logits_processor: Optional[LogitsProcessorList] = None, - grammar: Optional[LlamaGrammar] = None, - idx: Optional[int] = None, + logits_processor: LogitsProcessorList | None = None, + grammar: LlamaGrammar | None = None, + idx: int | None = None, ): """Sample a token from the model. @@ -735,10 +730,10 @@ def generate( mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, penalize_nl: bool = True, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - grammar: Optional[LlamaGrammar] = None, - ) -> Generator[int, Optional[Sequence[int]], None]: + logits_processor: LogitsProcessorList | None = None, + stopping_criteria: StoppingCriteriaList | None = None, + grammar: LlamaGrammar | None = None, + ) -> Generator[int, Sequence[int] | None, None]: """Create a generator of tokens from a prompt. Examples: @@ -764,7 +759,7 @@ def generate( # Check for kv cache prefix match if reset and self.n_tokens > 0: longest_prefix = 0 - for a, b in zip(self._input_ids, tokens[:-1]): + for a, b in zip(self._input_ids, tokens[:-1], strict=False): if a == b: longest_prefix += 1 else: @@ -839,7 +834,7 @@ def generate( ) def create_embedding( - self, input: Union[str, List[str]], model: Optional[str] = None, + self, input: str | list[str], model: str | None = None, ) -> CreateEmbeddingResponse: """Embed a string. 
@@ -855,12 +850,12 @@ def create_embedding( input = input if isinstance(input, list) else [input] # get numeric embeddings - embeds: Union[List[List[float]], List[List[List[float]]]] + embeds: list[list[float]] | list[list[list[float]]] total_tokens: int embeds, total_tokens = self.embed(input, return_count=True) # type: ignore # convert to CreateEmbeddingResponse - data: List[Embedding] = [ + data: list[Embedding] = [ { "object": "embedding", "embedding": emb, @@ -881,7 +876,7 @@ def create_embedding( def embed( self, - input: Union[str, List[str]], + input: str | list[str], normalize: bool = False, truncate: bool = True, return_count: bool = False, @@ -919,9 +914,9 @@ def embed( self._batch.reset() # decode and fetch embeddings - data: Union[List[List[float]], List[List[List[float]]]] = [] + data: list[list[float]] | list[list[list[float]]] = [] - def decode_batch(seq_sizes: List[int]): + def decode_batch(seq_sizes: list[int]): assert self._ctx.ctx is not None llama_cpp.llama_kv_cache_clear(self._ctx.ctx) self._ctx.decode(self._batch) @@ -932,7 +927,7 @@ def decode_batch(seq_sizes: List[int]): pos: int = 0 for i, size in enumerate(seq_sizes): ptr = llama_cpp.llama_get_embeddings(self._ctx.ctx) - embedding: List[List[float]] = [ + embedding: list[list[float]] = [ ptr[pos + j * n_embd : pos + (j + 1) * n_embd] for j in range(size) ] @@ -943,7 +938,7 @@ def decode_batch(seq_sizes: List[int]): else: for i in range(len(seq_sizes)): ptr = llama_cpp.llama_get_embeddings_seq(self._ctx.ctx, i) - embedding: List[float] = ptr[:n_embd] + embedding: list[float] = ptr[:n_embd] if normalize: embedding = _normalize_embedding(embedding) data.append(embedding) @@ -1002,34 +997,32 @@ def decode_batch(seq_sizes: List[int]): def _create_completion( self, - prompt: Union[str, List[int]], - suffix: Optional[str] = None, - max_tokens: Optional[int] = 16, + prompt: str | list[int], + suffix: str | None = None, + max_tokens: int | None = 16, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, typical_p: float = 1.0, - logprobs: Optional[int] = None, + logprobs: int | None = None, echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], + stop: str | list[str] | None = [], frequency_penalty: float = 0.0, presence_penalty: float = 0.0, repeat_penalty: float = 1.0, top_k: int = 40, stream: bool = False, - seed: Optional[int] = None, + seed: int | None = None, tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, - grammar: Optional[LlamaGrammar] = None, - logit_bias: Optional[Dict[str, float]] = None, - ) -> Union[ - Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse], - ]: + model: str | None = None, + stopping_criteria: StoppingCriteriaList | None = None, + logits_processor: LogitsProcessorList | None = None, + grammar: LlamaGrammar | None = None, + logit_bias: dict[str, float] | None = None, + ) -> Iterator[CreateCompletionResponse] | Iterator[CreateCompletionStreamResponse]: assert self._ctx is not None assert suffix is None or suffix.__class__ is str @@ -1044,8 +1037,8 @@ def _create_completion( add_space_prefix: bool = ( self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true" ) - bos_tokens: List[int] = [cls_token_id if cls_token_id != -1 else bos_token_id] - eos_tokens: List[int] = [ + bos_tokens: list[int] = [cls_token_id if cls_token_id != -1 else 
bos_token_id] + eos_tokens: list[int] = [ sep_token_id if sep_token_id != -1 else self.token_eos(), ] @@ -1069,9 +1062,9 @@ def _create_completion( # If prompt is empty, initialize completion with BOS token to avoid # detokenization including a space at the beginning of the completion - completion_tokens: List[int] = [] if len(prompt) > 0 else [bos_token_id] + completion_tokens: list[int] = [] if len(prompt) > 0 else [bos_token_id] # Add blank space to start of prompt to match OG llama tokenizer - prefix_tokens: List[int] = ( + prefix_tokens: list[int] = ( [prefix_token_id] if prefix_token_id >= 0 and suffix is not None else [] ) + ( ( @@ -1086,7 +1079,7 @@ def _create_completion( if isinstance(prompt, str) else prompt ) - suffix_tokens: List[int] = ( + suffix_tokens: list[int] = ( ( [suffix_token_id] + ( @@ -1100,10 +1093,10 @@ def _create_completion( if suffix_token_id >= 0 and suffix is not None else [] ) - middle_tokens: List[int] = ( + middle_tokens: list[int] = ( [middle_token_id] if middle_token_id >= 0 and suffix is not None else [] ) - prompt_tokens: List[int] = ( + prompt_tokens: list[int] = ( bos_tokens + ( (suffix_tokens + prefix_tokens + middle_tokens) @@ -1301,7 +1294,7 @@ def logit_bias_processor( logits = self._scores[token_offset - 1, :] current_logprobs = Llama.logits_to_logprobs(logits).tolist() sorted_logprobs = sorted( - zip(current_logprobs, range(len(current_logprobs))), + zip(current_logprobs, range(len(current_logprobs)), strict=False), reverse=True, ) top_logprob = { @@ -1420,7 +1413,7 @@ def logit_bias_processor( ), ) - logprobs_or_none: Optional[CompletionLogprobs] = None + logprobs_or_none: CompletionLogprobs | None = None if logprobs is not None: if token == bos_token_id: continue @@ -1438,7 +1431,7 @@ def logit_bias_processor( logits = self._scores[token_offset, :] current_logprobs = Llama.logits_to_logprobs(logits).tolist() sorted_logprobs = sorted( - zip(current_logprobs, range(len(current_logprobs))), + zip(current_logprobs, range(len(current_logprobs)), strict=False), reverse=True, ) top_logprob = { @@ -1529,14 +1522,14 @@ def logit_bias_processor( if suffix_token_id < 0 and suffix is not None: text_str = text_str + suffix - logprobs_or_none: Optional[CompletionLogprobs] = None + logprobs_or_none: CompletionLogprobs | None = None if logprobs is not None: text_offset = 0 if echo else len(prompt) token_offset = 0 if echo else len(prompt_tokens[1:]) - text_offsets: List[int] = [] - token_logprobs: List[Optional[float]] = [] - tokens: List[str] = [] - top_logprobs: List[Optional[Dict[str, float]]] = [] + text_offsets: list[int] = [] + token_logprobs: list[float | None] = [] + tokens: list[str] = [] + top_logprobs: list[dict[str, float] | None] = [] if echo: # Remove leading BOS token if exists @@ -1556,7 +1549,7 @@ def logit_bias_processor( all_logprobs = Llama.logits_to_logprobs(self._scores)[token_offset:] # TODO: may be able to change this loop to use np.take_along_dim for idx, (token, token_str, logprobs_token) in enumerate( - zip(all_tokens, all_token_strs, all_logprobs), + zip(all_tokens, all_token_strs, all_logprobs, strict=False), ): if token == bos_token_id: continue @@ -1570,10 +1563,10 @@ def logit_bias_processor( ) tokens.append(token_str) sorted_logprobs = sorted( - zip(logprobs_token, range(len(logprobs_token))), reverse=True, + zip(logprobs_token, range(len(logprobs_token)), strict=False), reverse=True, ) token_logprobs.append(logprobs_token[int(token)]) - top_logprob: Optional[Dict[str, float]] = { + top_logprob: dict[str, float] | None = { 
self.detokenize([i], prev_tokens=all_tokens[:idx]).decode( "utf-8", errors="ignore", ): logprob @@ -1616,32 +1609,32 @@ def logit_bias_processor( def create_completion( self, - prompt: Union[str, List[int]], - suffix: Optional[str] = None, - max_tokens: Optional[int] = 16, + prompt: str | list[int], + suffix: str | None = None, + max_tokens: int | None = 16, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, typical_p: float = 1.0, - logprobs: Optional[int] = None, + logprobs: int | None = None, echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], + stop: str | list[str] | None = [], frequency_penalty: float = 0.0, presence_penalty: float = 0.0, repeat_penalty: float = 1.0, top_k: int = 40, stream: bool = False, - seed: Optional[int] = None, + seed: int | None = None, tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, - grammar: Optional[LlamaGrammar] = None, - logit_bias: Optional[Dict[str, float]] = None, - ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: + model: str | None = None, + stopping_criteria: StoppingCriteriaList | None = None, + logits_processor: LogitsProcessorList | None = None, + grammar: LlamaGrammar | None = None, + logit_bias: dict[str, float] | None = None, + ) -> CreateCompletionResponse | Iterator[CreateCompletionStreamResponse]: """Generate text from a prompt. Args: @@ -1714,31 +1707,31 @@ def create_completion( def __call__( self, prompt: str, - suffix: Optional[str] = None, - max_tokens: Optional[int] = 16, + suffix: str | None = None, + max_tokens: int | None = 16, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, typical_p: float = 1.0, - logprobs: Optional[int] = None, + logprobs: int | None = None, echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], + stop: str | list[str] | None = [], frequency_penalty: float = 0.0, presence_penalty: float = 0.0, repeat_penalty: float = 1.0, top_k: int = 40, stream: bool = False, - seed: Optional[int] = None, + seed: int | None = None, tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, - grammar: Optional[LlamaGrammar] = None, - logit_bias: Optional[Dict[str, float]] = None, - ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: + model: str | None = None, + stopping_criteria: StoppingCriteriaList | None = None, + logits_processor: LogitsProcessorList | None = None, + grammar: LlamaGrammar | None = None, + logit_bias: dict[str, float] | None = None, + ) -> CreateCompletionResponse | Iterator[CreateCompletionStreamResponse]: """Generate text from a prompt. 
Args: @@ -1805,21 +1798,21 @@ def __call__( def create_chat_completion( self, - messages: List[ChatCompletionRequestMessage], - functions: Optional[List[ChatCompletionFunction]] = None, - function_call: Optional[ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[ChatCompletionTool]] = None, - tool_choice: Optional[ChatCompletionToolChoiceOption] = None, + messages: list[ChatCompletionRequestMessage], + functions: list[ChatCompletionFunction] | None = None, + function_call: ChatCompletionRequestFunctionCall | None = None, + tools: list[ChatCompletionTool] | None = None, + tool_choice: ChatCompletionToolChoiceOption | None = None, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, min_p: float = 0.05, typical_p: float = 1.0, stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - seed: Optional[int] = None, - response_format: Optional[ChatCompletionRequestResponseFormat] = None, - max_tokens: Optional[int] = None, + stop: str | list[str] | None = [], + seed: int | None = None, + response_format: ChatCompletionRequestResponseFormat | None = None, + max_tokens: int | None = None, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.0, @@ -1827,15 +1820,13 @@ def create_chat_completion( mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - model: Optional[str] = None, - logits_processor: Optional[LogitsProcessorList] = None, - grammar: Optional[LlamaGrammar] = None, - logit_bias: Optional[Dict[str, float]] = None, - logprobs: Optional[bool] = None, - top_logprobs: Optional[int] = None, - ) -> Union[ - CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse], - ]: + model: str | None = None, + logits_processor: LogitsProcessorList | None = None, + grammar: LlamaGrammar | None = None, + logit_bias: dict[str, float] | None = None, + logprobs: bool | None = None, + top_logprobs: int | None = None, + ) -> CreateChatCompletionResponse | Iterator[CreateChatCompletionStreamResponse]: """Generate a chat completion from a list of messages. Args: @@ -2084,7 +2075,7 @@ def __del__(self) -> None: @staticmethod def logits_to_logprobs( - logits: Union[npt.NDArray[np.single], List], axis: int = -1, + logits: npt.NDArray[np.single] | list, axis: int = -1, ) -> npt.NDArray[np.single]: # https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.log_softmax.html logits_maxs: np.ndarray = np.amax(logits, axis=axis, keepdims=True) @@ -2103,7 +2094,7 @@ def logits_to_logprobs( @staticmethod def longest_token_prefix(a: Sequence[int], b: Sequence[int]): longest_prefix = 0 - for _a, _b in zip(a, b): + for _a, _b in zip(a, b, strict=False): if _a == _b: longest_prefix += 1 else: @@ -2114,10 +2105,10 @@ def longest_token_prefix(a: Sequence[int], b: Sequence[int]): def from_pretrained( cls, repo_id: str, - filename: Optional[str], - local_dir: Optional[Union[str, os.PathLike[str]]] = None, - local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", - cache_dir: Optional[Union[str, os.PathLike[str]]] = None, + filename: str | None, + local_dir: str | os.PathLike[str] | None = None, + local_dir_use_symlinks: bool | Literal["auto"] = "auto", + cache_dir: str | os.PathLike[str] | None = None, **kwargs: Any, ) -> Llama: """Create a Llama model from a pretrained model name or path. 
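Every signature rewritten in the hunks above follows the same two substitutions: typing.List/Dict/Tuple/Deque become the builtin generics list/dict/tuple/deque, and Optional[X]/Union[X, Y] become X | None / X | Y (presumably ruff's pyupgrade rules UP006 and UP007). A small sketch of the pattern, using hypothetical names that are not part of this patch:

# Hedged illustration only; `load_aliases` and its parameters are invented.
import os

# Before: def load_aliases(path: Union[str, os.PathLike[str]],
#                          names: Optional[List[str]] = None) -> Dict[str, str]: ...
# After (builtin generics + PEP 604 unions, as applied throughout this file):
def load_aliases(path: str | os.PathLike[str], names: list[str] | None = None) -> dict[str, str]:
    resolved = os.fspath(path)
    return {name: resolved for name in (names or [])}

Both spellings are equivalent for type checkers on Python 3.10 and later, which is what lets the fixer apply the rewrite mechanically across every method signature shown here.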
@@ -2152,7 +2143,7 @@ def from_pretrained( ] # split each file into repo_id, subfolder, filename - file_list: List[str] = [] + file_list: list[str] = [] for file in files: rel_path = Path(file).relative_to(repo_id) file_list.append(str(rel_path)) @@ -2226,7 +2217,7 @@ def __init__( ] -class LogitsProcessorList(List[LogitsProcessor]): +class LogitsProcessorList(list[LogitsProcessor]): def __call__( self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single], ) -> npt.NDArray[np.single]: @@ -2238,7 +2229,7 @@ def __call__( StoppingCriteria = Callable[[npt.NDArray[np.intc], npt.NDArray[np.single]], bool] -class StoppingCriteriaList(List[StoppingCriteria]): +class StoppingCriteriaList(list[StoppingCriteria]): def __call__( self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single], ) -> bool: diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py index 8a9f18f37..7bf4084db 100644 --- a/llama_cpp/llama_cache.py +++ b/llama_cpp/llama_cache.py @@ -1,9 +1,9 @@ import sys from abc import ABC, abstractmethod from collections import OrderedDict +from collections.abc import Sequence from typing import ( Optional, - Sequence, Tuple, ) @@ -27,8 +27,8 @@ def cache_size(self) -> int: def _find_longest_prefix_key( self, - key: Tuple[int, ...], - ) -> Optional[Tuple[int, ...]]: + key: tuple[int, ...], + ) -> tuple[int, ...] | None: pass @abstractmethod @@ -52,7 +52,7 @@ class LlamaRAMCache(BaseLlamaCache): def __init__(self, capacity_bytes: int = (2 << 30)): super().__init__(capacity_bytes) self.capacity_bytes = capacity_bytes - self.cache_state: OrderedDict[Tuple[int, ...], llama_cpp.llama.LlamaState] = ( + self.cache_state: OrderedDict[tuple[int, ...], llama_cpp.llama.LlamaState] = ( OrderedDict() ) @@ -62,8 +62,8 @@ def cache_size(self): def _find_longest_prefix_key( self, - key: Tuple[int, ...], - ) -> Optional[Tuple[int, ...]]: + key: tuple[int, ...], + ) -> tuple[int, ...] | None: min_len = 0 min_key = None keys = ( @@ -116,10 +116,10 @@ def cache_size(self): def _find_longest_prefix_key( self, - key: Tuple[int, ...], - ) -> Optional[Tuple[int, ...]]: + key: tuple[int, ...], + ) -> tuple[int, ...] | None: min_len = 0 - min_key: Optional[Tuple[int, ...]] = None + min_key: tuple[int, ...] 
| None = None for k in self.cache.iterkeys(): # type: ignore prefix_len = llama_cpp.llama.Llama.longest_token_prefix(k, key) if prefix_len > min_len: diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index a49bf6593..d7d672351 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -7,11 +7,11 @@ import random import string import sys +from collections.abc import Iterator from contextlib import ExitStack from typing import ( Any, Dict, - Iterator, List, Literal, Optional, @@ -65,26 +65,24 @@ def __call__( # llama.cpp instance llama: llama.Llama, # openai api parameters - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + messages: list[llama_types.ChatCompletionRequestMessage], + functions: list[llama_types.ChatCompletionFunction] | None = None, + function_call: llama_types.ChatCompletionRequestFunctionCall | None = None, + tools: list[llama_types.ChatCompletionTool] | None = None, + tool_choice: llama_types.ChatCompletionToolChoiceOption | None = None, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - seed: Optional[int] = None, - response_format: Optional[ - llama_types.ChatCompletionRequestResponseFormat - ] = None, - max_tokens: Optional[int] = None, + stop: str | list[str] | None = [], + seed: int | None = None, + response_format: llama_types.ChatCompletionRequestResponseFormat | None = None, + max_tokens: int | None = None, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, - model: Optional[str] = None, - logit_bias: Optional[Dict[str, float]] = None, + model: str | None = None, + logit_bias: dict[str, float] | None = None, # llama.cpp parameters min_p: float = 0.05, typical_p: float = 1.0, @@ -92,15 +90,12 @@ def __call__( mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - logits_processor: Optional[llama.LogitsProcessorList] = None, - grammar: Optional[llama.LlamaGrammar] = None, - logprobs: Optional[bool] = None, - top_logprobs: Optional[int] = None, + logits_processor: llama.LogitsProcessorList | None = None, + grammar: llama.LlamaGrammar | None = None, + logprobs: bool | None = None, + top_logprobs: int | None = None, **kwargs, # type: ignore - ) -> Union[ - llama_types.CreateChatCompletionResponse, - Iterator[llama_types.CreateChatCompletionStreamResponse], - ]: ... + ) -> llama_types.CreateChatCompletionResponse | Iterator[llama_types.CreateChatCompletionStreamResponse]: ... 
class LlamaChatCompletionHandlerNotFoundException(Exception): @@ -108,7 +103,7 @@ class LlamaChatCompletionHandlerNotFoundException(Exception): class LlamaChatCompletionHandlerRegistry(Singleton): - _chat_handlers: Dict[str, LlamaChatCompletionHandler] = {} + _chat_handlers: dict[str, LlamaChatCompletionHandler] = {} def register_chat_completion_handler( self, @@ -166,8 +161,8 @@ class ChatFormatterResponse: stop contains the stop token or list of stop tokens to use for the chat format.""" prompt: str - stop: Optional[Union[str, List[str]]] = None - stopping_criteria: Optional[llama.StoppingCriteriaList] = None + stop: str | list[str] | None = None + stopping_criteria: llama.StoppingCriteriaList | None = None added_special: bool = False @@ -180,7 +175,7 @@ class ChatFormatter(Protocol): def __call__( self, *, - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: ... @@ -192,7 +187,7 @@ def __init__( eos_token: str, bos_token: str, add_generation_prompt: bool = True, - stop_token_ids: Optional[List[int]] = None, + stop_token_ids: list[int] | None = None, ): """A chat formatter that uses jinja2 templates to format the prompt.""" self.template = template @@ -212,11 +207,11 @@ def __init__( def __call__( self, *, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + messages: list[llama_types.ChatCompletionRequestMessage], + functions: list[llama_types.ChatCompletionFunction] | None = None, + function_call: llama_types.ChatCompletionRequestFunctionCall | None = None, + tools: list[llama_types.ChatCompletionTool] | None = None, + tool_choice: llama_types.ChatCompletionToolChoiceOption | None = None, **kwargs: Any, ) -> ChatFormatterResponse: def raise_exception(message: str): @@ -323,28 +318,19 @@ def _convert_text_completion_chunks_to_chat( def _convert_completion_to_chat( - completion_or_chunks: Union[ - llama_types.CreateCompletionResponse, - Iterator[llama_types.CreateCompletionStreamResponse], - ], + completion_or_chunks: llama_types.CreateCompletionResponse | Iterator[llama_types.CreateCompletionStreamResponse], stream: bool = False, -) -> Union[ - llama_types.CreateChatCompletionResponse, Iterator[llama_types.ChatCompletionChunk], -]: +) -> llama_types.CreateChatCompletionResponse | Iterator[llama_types.ChatCompletionChunk]: if stream: chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore return _convert_text_completion_chunks_to_chat(chunks) - else: - completion: llama_types.Completion = completion_or_chunks # type: ignore - return _convert_text_completion_to_chat(completion) + completion: llama_types.Completion = completion_or_chunks # type: ignore + return _convert_text_completion_to_chat(completion) def _convert_completion_to_chat_function( tool_name: str, - completion_or_chunks: Union[ - llama_types.CreateCompletionResponse, - Iterator[llama_types.CreateCompletionStreamResponse], - ], + completion_or_chunks: llama_types.CreateCompletionResponse | Iterator[llama_types.CreateCompletionStreamResponse], stream: bool, ): if not stream: @@ -385,80 +371,42 @@ def _convert_completion_to_chat_function( "usage": completion["usage"], } return 
chat_completion - else: - chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore - - def _stream_response_to_function_stream( - chunks: Iterator[llama_types.CreateCompletionStreamResponse], - ) -> Iterator[llama_types.CreateChatCompletionStreamResponse]: - # blank first message - first = True - id_ = None - created = None - model = None - tool_id = None - for chunk in chunks: - if first: - id_ = "chat" + chunk["id"] - created = chunk["created"] - model = chunk["model"] - tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"] - yield { - "id": id_, - "object": "chat.completion.chunk", - "created": created, - "model": model, - "choices": [ - { - "index": 0, - "finish_reason": None, - "logprobs": None, - "delta": { - "role": "assistant", - "content": None, - "function_call": None, - "tool_calls": None, - }, - }, - ], - } - yield { - "id": "chat" + chunk["id"], - "object": "chat.completion.chunk", - "created": chunk["created"], - "model": chunk["model"], - "choices": [ - { - "index": 0, - "finish_reason": None, - "logprobs": chunk["choices"][0]["logprobs"], - "delta": { - "role": None, - "content": None, - "function_call": { - "name": tool_name, - "arguments": chunk["choices"][0]["text"], - }, - "tool_calls": [ - { - "index": 0, - "id": tool_id, - "type": "function", - "function": { - "name": tool_name, - "arguments": chunk["choices"][0][ - "text" - ], - }, - }, - ], - }, + chunks: Iterator[llama_types.CreateCompletionStreamResponse] = completion_or_chunks # type: ignore + + def _stream_response_to_function_stream( + chunks: Iterator[llama_types.CreateCompletionStreamResponse], + ) -> Iterator[llama_types.CreateChatCompletionStreamResponse]: + # blank first message + first = True + id_ = None + created = None + model = None + tool_id = None + for chunk in chunks: + if first: + id_ = "chat" + chunk["id"] + created = chunk["created"] + model = chunk["model"] + tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"] + yield { + "id": id_, + "object": "chat.completion.chunk", + "created": created, + "model": model, + "choices": [ + { + "index": 0, + "finish_reason": None, + "logprobs": None, + "delta": { + "role": "assistant", + "content": None, + "function_call": None, + "tool_calls": None, }, - ], - } - first = False - continue - assert tool_id is not None + }, + ], + } yield { "id": "chat" + chunk["id"], "object": "chat.completion.chunk", @@ -483,7 +431,9 @@ def _stream_response_to_function_stream( "type": "function", "function": { "name": tool_name, - "arguments": chunk["choices"][0]["text"], + "arguments": chunk["choices"][0][ + "text" + ], }, }, ], @@ -491,29 +441,64 @@ def _stream_response_to_function_stream( }, ], } - - if id_ is not None and created is not None and model is not None: - yield { - "id": id_, - "object": "chat.completion.chunk", - "created": created, - "model": model, - "choices": [ - { - "index": 0, - "finish_reason": "tool_calls", - "logprobs": None, - "delta": { - "role": None, - "content": None, - "function_call": None, - "tool_calls": None, + first = False + continue + assert tool_id is not None + yield { + "id": "chat" + chunk["id"], + "object": "chat.completion.chunk", + "created": chunk["created"], + "model": chunk["model"], + "choices": [ + { + "index": 0, + "finish_reason": None, + "logprobs": chunk["choices"][0]["logprobs"], + "delta": { + "role": None, + "content": None, + "function_call": { + "name": tool_name, + "arguments": chunk["choices"][0]["text"], }, + "tool_calls": [ + { + "index": 0, + "id": tool_id, + 
"type": "function", + "function": { + "name": tool_name, + "arguments": chunk["choices"][0]["text"], + }, + }, + ], }, - ], - } + }, + ], + } + + if id_ is not None and created is not None and model is not None: + yield { + "id": id_, + "object": "chat.completion.chunk", + "created": created, + "model": model, + "choices": [ + { + "index": 0, + "finish_reason": "tool_calls", + "logprobs": None, + "delta": { + "role": None, + "content": None, + "function_call": None, + "tool_calls": None, + }, + }, + ], + } - return _stream_response_to_function_stream(chunks) + return _stream_response_to_function_stream(chunks) def chat_formatter_to_chat_completion_handler( @@ -522,23 +507,21 @@ def chat_formatter_to_chat_completion_handler( def chat_completion_handler( *, llama: llama.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + messages: list[llama_types.ChatCompletionRequestMessage], + functions: list[llama_types.ChatCompletionFunction] | None = None, + function_call: llama_types.ChatCompletionRequestFunctionCall | None = None, + tools: list[llama_types.ChatCompletionTool] | None = None, + tool_choice: llama_types.ChatCompletionToolChoiceOption | None = None, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, min_p: float = 0.05, typical_p: float = 1.0, stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - seed: Optional[int] = None, - response_format: Optional[ - llama_types.ChatCompletionRequestResponseFormat - ] = None, - max_tokens: Optional[int] = None, + stop: str | list[str] | None = [], + seed: int | None = None, + response_format: llama_types.ChatCompletionRequestResponseFormat | None = None, + max_tokens: int | None = None, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -546,17 +529,14 @@ def chat_completion_handler( mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - model: Optional[str] = None, - logits_processor: Optional[llama.LogitsProcessorList] = None, - grammar: Optional[llama.LlamaGrammar] = None, - logit_bias: Optional[Dict[str, float]] = None, - logprobs: Optional[bool] = None, - top_logprobs: Optional[int] = None, + model: str | None = None, + logits_processor: llama.LogitsProcessorList | None = None, + grammar: llama.LlamaGrammar | None = None, + logit_bias: dict[str, float] | None = None, + logprobs: bool | None = None, + top_logprobs: int | None = None, **kwargs, # type: ignore - ) -> Union[ - llama_types.CreateChatCompletionResponse, - Iterator[llama_types.CreateChatCompletionStreamResponse], - ]: + ) -> llama_types.CreateChatCompletionResponse | Iterator[llama_types.CreateChatCompletionStreamResponse]: result = chat_formatter( messages=messages, functions=functions, @@ -666,7 +646,7 @@ def chat_completion_handler( def hf_autotokenizer_to_chat_formatter( - pretrained_model_name_or_path: Union[str, os.PathLike[str]], + pretrained_model_name_or_path: str | os.PathLike[str], ) -> ChatFormatter: # https://huggingface.co/docs/transformers/main/chat_templating # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format @@ -676,7 +656,7 @@ def hf_autotokenizer_to_chat_formatter( tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path) # 
type: ignore def format_autotokenizer( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: tokenizer.use_default_system_prompt = False # type: ignore @@ -691,14 +671,14 @@ def format_autotokenizer( def hf_autotokenizer_to_chat_completion_handler( - pretrained_model_name_or_path: Union[str, os.PathLike[str]], + pretrained_model_name_or_path: str | os.PathLike[str], ) -> LlamaChatCompletionHandler: chat_formatter = hf_autotokenizer_to_chat_formatter(pretrained_model_name_or_path) return chat_formatter_to_chat_completion_handler(chat_formatter) def hf_tokenizer_config_to_chat_formatter( - tokenizer_config: Dict[str, Any], + tokenizer_config: dict[str, Any], add_generation_prompt: bool = True, ) -> ChatFormatter: assert isinstance(tokenizer_config, dict) @@ -721,7 +701,7 @@ def hf_tokenizer_config_to_chat_formatter( ).from_string(chat_template) def format_tokenizer_config( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: # TODO: veryify this is correct @@ -746,7 +726,7 @@ def format_tokenizer_config( def hf_tokenizer_config_to_chat_completion_handler( - tokenizer_config: Dict[str, Any], + tokenizer_config: dict[str, Any], add_generation_prompt: bool = True, ) -> LlamaChatCompletionHandler: chat_formatter = hf_tokenizer_config_to_chat_formatter( @@ -755,7 +735,7 @@ def hf_tokenizer_config_to_chat_completion_handler( return chat_formatter_to_chat_completion_handler(chat_formatter) -def guess_chat_format_from_gguf_metadata(metadata: Dict[str, str]) -> Optional[str]: +def guess_chat_format_from_gguf_metadata(metadata: dict[str, str]) -> str | None: if "tokenizer.chat_template" not in metadata: return None @@ -779,7 +759,7 @@ def guess_chat_format_from_gguf_metadata(metadata: Dict[str, str]) -> Optional[s def _get_system_message( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], ) -> str: """Get the first system message.""" for message in messages: @@ -789,11 +769,11 @@ def _get_system_message( def _map_roles( - messages: List[llama_types.ChatCompletionRequestMessage], - role_map: Dict[str, str], -) -> List[Tuple[str, Optional[str]]]: + messages: list[llama_types.ChatCompletionRequestMessage], + role_map: dict[str, str], +) -> list[tuple[str, str | None]]: """Map the message roles.""" - output: List[Tuple[str, Optional[str]]] = [] + output: list[tuple[str, str | None]] = [] for message in messages: role = message["role"] if role in role_map: @@ -805,7 +785,7 @@ def _map_roles( def _format_llama2( - system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str, sep2: str, + system_message: str, messages: list[tuple[str, str | None]], sep: str, sep2: str, ) -> str: """Format the prompt with the llama2 style.""" seps = [sep, sep2] @@ -822,7 +802,7 @@ def _format_llama2( def _format_add_colon_single( - system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str, + system_message: str, messages: list[tuple[str, str | None]], sep: str, ) -> str: """Format the prompt with the add-colon-single style.""" ret = system_message + sep @@ -835,7 +815,7 @@ def _format_add_colon_single( def _format_add_colon_two( - system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str, sep2: str, + system_message: str, messages: list[tuple[str, str | None]], sep: str, sep2: str, ) -> str: """Format 
the prompt with the add-colon-two style.""" seps = [sep, sep2] @@ -849,7 +829,7 @@ def _format_add_colon_two( def _format_no_colon_single( - system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str, + system_message: str, messages: list[tuple[str, str | None]], sep: str, ) -> str: """Format the prompt with the no-colon-single style.""" ret = system_message @@ -862,7 +842,7 @@ def _format_no_colon_single( def _format_add_colon_space_single( - system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str, + system_message: str, messages: list[tuple[str, str | None]], sep: str, ) -> str: """Format the prompt with the add-colon-space-single style.""" ret = system_message + sep @@ -875,7 +855,7 @@ def _format_add_colon_space_single( def _format_chatml( - system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str, + system_message: str, messages: list[tuple[str, str | None]], sep: str, ) -> str: """Format the prompt with the chatml style.""" ret = "" if system_message == "" else system_message + sep + "\n" @@ -888,7 +868,7 @@ def _format_chatml( def _format_chatglm3( - system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str, + system_message: str, messages: list[tuple[str, str | None]], sep: str, ) -> str: """Format the prompt with the chatglm3 style.""" ret = "" @@ -916,8 +896,7 @@ def _grammar_for_json_schema( except Exception as e: if fallback_to_json: return _grammar_for_json(verbose=verbose) - else: - raise e + raise e def _grammar_for_response_format( @@ -931,8 +910,7 @@ def _grammar_for_response_format( return _grammar_for_json_schema( json.dumps(response_format["schema"]), verbose=verbose, ) - else: - return _grammar_for_json(verbose=verbose) + return _grammar_for_json(verbose=verbose) ### Chat Formats ### @@ -953,7 +931,7 @@ def decorator(f: ChatFormatter): # system prompt is "embedded" in the first message @register_chat_format("llama-2") def format_llama2( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: _system_template = "[INST] <>\n{system_message}\n<>" @@ -970,7 +948,7 @@ def format_llama2( # https://github.com/meta-llama/llama3/blob/main/llama/tokenizer.py#L202-L229 @register_chat_format("llama-3") def format_llama3( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: _roles = dict( @@ -987,7 +965,7 @@ def format_llama3( @register_chat_format("alpaca") def format_alpaca( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: _roles = dict(user="### Instruction", assistant="### Response") @@ -1001,7 +979,7 @@ def format_alpaca( @register_chat_format("qwen") def format_qwen( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: _roles = dict(user="<|im_start|>user", assistant="<|im_start|>assistant") @@ -1018,7 +996,7 @@ def format_qwen( @register_chat_format("vicuna") def format( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: _system_message = "A chat between a curious user and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the user's questions." @@ -1034,7 +1012,7 @@ def format( @register_chat_format("oasst_llama") def format_oasst_llama( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: _system_template = "[INST] <>\n{system_message}\n<>\n\n" @@ -1050,7 +1028,7 @@ def format_oasst_llama( @register_chat_format("baichuan-2") def format_baichuan2( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: _system_template = "{system_message}" @@ -1066,7 +1044,7 @@ def format_baichuan2( @register_chat_format("baichuan") def format_baichuan( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: _system_template = "{system_message}" @@ -1082,7 +1060,7 @@ def format_baichuan( @register_chat_format("openbuddy") def format_openbuddy( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: _system_message = """You are a helpful, respectful and honest INTP-T AI Assistant named Buddy. You are talking to a human User. @@ -1104,7 +1082,7 @@ def format_openbuddy( @register_chat_format("redpajama-incite") def format_redpajama_incite( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: _system_message = _get_system_message(messages) @@ -1120,7 +1098,7 @@ def format_redpajama_incite( @register_chat_format("snoozy") def format_snoozy( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: system_template = "### Instruction:\n{system_message}" @@ -1142,7 +1120,7 @@ def format_snoozy( @register_chat_format("phind") def format_phind( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: _roles = dict(user="### User Message", assistant="### Assistant") @@ -1156,7 +1134,7 @@ def format_phind( @register_chat_format("intel") def format_intel( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: _roles = dict(user="### User:", assistant="### Assistant:") @@ -1170,7 +1148,7 @@ def format_intel( @register_chat_format("open-orca") def format_open_orca( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: system_template = "{system_message}" @@ -1189,7 +1167,7 @@ def format_open_orca( # stop_token_ids=[32000, 32001], # "<|end_of_turn|>" stop_str = "User" system_message = system_template.format(system_message=system_message) - _messages = _map_roles(messages, dict(zip(roles, roles))) + _messages = _map_roles(messages, dict(zip(roles, roles, strict=False))) _messages.append((roles[1], None)) _prompt = _format_add_colon_space_single(system_message, _messages, sep) return ChatFormatterResponse(prompt=_prompt, stop=stop_str) @@ -1197,7 +1175,7 @@ def format_open_orca( @register_chat_format("mistrallite") def 
format_mistrallite( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: _roles = dict(user="<|prompter|>", assistant="\n<|assistant|>") @@ -1213,7 +1191,7 @@ def format_mistrallite( @register_chat_format("zephyr") def format_zephyr( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: system_template = """<|system|> @@ -1230,7 +1208,7 @@ def format_zephyr( @register_chat_format("pygmalion") def format_pygmalion( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: system_template = """<|system|>{system_message}""" @@ -1246,7 +1224,7 @@ def format_pygmalion( @register_chat_format("chatml") def format_chatml( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: system_template = """<|im_start|>system @@ -1263,7 +1241,7 @@ def format_chatml( @register_chat_format("mistral-instruct") def format_mistral_instruct( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: eos = "" @@ -1284,7 +1262,7 @@ def format_mistral_instruct( @register_chat_format("chatglm3") def format_chatglm3( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: system_template = """<|system|> @@ -1301,7 +1279,7 @@ def format_chatglm3( @register_chat_format("openchat") def format_openchat( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: system_template = "{system_message}<|end_of_turn|>" @@ -1343,7 +1321,7 @@ def format_saiga( # https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b @register_chat_format("gemma") def format_gemma( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], **kwargs: Any, ) -> ChatFormatterResponse: system_message = _get_system_message(messages) @@ -1365,20 +1343,20 @@ def format_gemma( @register_chat_completion_handler("functionary") def functionary_chat_handler( llama: llama.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + messages: list[llama_types.ChatCompletionRequestMessage], + functions: list[llama_types.ChatCompletionFunction] | None = None, + function_call: llama_types.ChatCompletionRequestFunctionCall | None = None, + tools: list[llama_types.ChatCompletionTool] | None = None, + tool_choice: llama_types.ChatCompletionToolChoiceOption | None = None, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, min_p: float = 0.05, typical_p: float = 1.0, stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, - max_tokens: Optional[int] 
= None, + stop: str | list[str] | None = [], + response_format: llama_types.ChatCompletionRequestResponseFormat | None = None, + max_tokens: int | None = None, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -1386,15 +1364,15 @@ def functionary_chat_handler( mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - model: Optional[str] = None, - logits_processor: Optional[llama.LogitsProcessorList] = None, - grammar: Optional[llama.LlamaGrammar] = None, + model: str | None = None, + logits_processor: llama.LogitsProcessorList | None = None, + grammar: llama.LlamaGrammar | None = None, **kwargs, # type: ignore -) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: +) -> llama_types.ChatCompletion | Iterator[llama_types.ChatCompletionChunk]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary""" def generate_type_definition( - param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs, + param: dict[str, llama_types.JsonType], indent_level: int, shared_defs, ) -> str: indent = " " * indent_level if "$ref" in param: @@ -1403,11 +1381,11 @@ def generate_type_definition( -1 ] # Extract the type name from the reference return ref_name - elif param.get("type") == "array": + if param.get("type") == "array": items = param.get("items", {}) item_type = generate_type_definition(items, indent_level + 1, shared_defs) return f"Array<{item_type}>" - elif param.get("type") == "object": + if param.get("type") == "object": properties = param.get("properties", {}) nested_schema = "{\n" for nested_param_name, nested_param in properties.items(): @@ -1419,12 +1397,11 @@ def generate_type_definition( ) nested_schema += indent + "}" return nested_schema - elif "enum" in param: + if "enum" in param: # Enum type return " | ".join([f'"{enum_value}"' for enum_value in param["enum"]]) - else: - # Simple type - return param.get("type", "any") + # Simple type + return param.get("type", "any") def generate_shared_definitions(shared_defs, indent_level: int) -> str: indent = " " * indent_level @@ -1478,11 +1455,11 @@ def generate_schema_from_functions(functions, namespace="functions") -> str: return schema def prepare_messages_for_inference( - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunctions]] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, + messages: list[llama_types.ChatCompletionRequestMessage], + functions: list[llama_types.ChatCompletionFunctions] | None = None, + tools: list[llama_types.ChatCompletionTool] | None = None, ): - all_messages: List[llama_types.ChatCompletionRequestMessage] = [] + all_messages: list[llama_types.ChatCompletionRequestMessage] = [] if functions is not None: all_messages.append( llama_types.ChatCompletionRequestSystemMessage( @@ -1531,26 +1508,24 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): if msg["role"] == "system": return f"system:\n{msg['content']}\n" - elif msg["role"] == "function" and "name" in msg: + if msg["role"] == "function" and "name" in msg: return f"function name={msg['name']}:\n{msg['content']}\n" - elif msg["role"] == "function" and "function_call" in msg: + if msg["role"] == "function" and "function_call" in msg: return f"function 
name={msg['function_call']['name']}:\n{msg['function_call']['arguments']}\n" - elif msg["role"] == "tool": + if msg["role"] == "tool": if msg["content"] is not None: return f"function name={msg['tool_call_id']}:\n{msg['content']}\n" - else: - return f"function name={msg['tool_call_id']}\n" - elif msg["role"] == "user": + return f"function name={msg['tool_call_id']}\n" + if msg["role"] == "user": if msg["content"] is None: return "user:\n\n" - else: - return f"user:\n{msg['content']}\n" - elif msg["role"] == "assistant": + return f"user:\n{msg['content']}\n" + if msg["role"] == "assistant": if msg["content"] is not None and "function_call" in msg: return f"assistant:\n{msg['content']}\nassistant to={msg['function_call']['name']}:\n{msg['function_call']['arguments']}\n" - elif "function_call" in msg: + if "function_call" in msg: return f"assistant to={msg['function_call']['name']}:\n{msg['function_call']['arguments']}\n" - elif "tool_calls" in msg and len(msg["tool_calls"]) > 0: + if "tool_calls" in msg and len(msg["tool_calls"]) > 0: for tool_call in msg[ "tool_calls" ]: # NOTE: probably doesn't work with the functionary model @@ -1724,20 +1699,20 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): @register_chat_completion_handler("functionary-v2") def functionary_v1_v2_chat_handler( llama: llama.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + messages: list[llama_types.ChatCompletionRequestMessage], + functions: list[llama_types.ChatCompletionFunction] | None = None, + function_call: llama_types.ChatCompletionRequestFunctionCall | None = None, + tools: list[llama_types.ChatCompletionTool] | None = None, + tool_choice: llama_types.ChatCompletionToolChoiceOption | None = None, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, min_p: float = 0.05, typical_p: float = 1.0, stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, - max_tokens: Optional[int] = None, + stop: str | list[str] | None = [], + response_format: llama_types.ChatCompletionRequestResponseFormat | None = None, + max_tokens: int | None = None, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -1745,11 +1720,11 @@ def functionary_v1_v2_chat_handler( mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - model: Optional[str] = None, - logits_processor: Optional[llama.LogitsProcessorList] = None, - grammar: Optional[llama.LlamaGrammar] = None, + model: str | None = None, + logits_processor: llama.LogitsProcessorList | None = None, + grammar: llama.LlamaGrammar | None = None, **kwargs, # type: ignore -) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: +) -> llama_types.ChatCompletion | Iterator[llama_types.ChatCompletionChunk]: SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
The assistant calls functions with appropriate input when necessary""" tokenizer = llama.tokenizer_ @@ -1774,7 +1749,7 @@ def functionary_v1_v2_chat_handler( CONTENT_TOKEN = "<|content|>" def generate_type_definition( - param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs, + param: dict[str, llama_types.JsonType], indent_level: int, shared_defs, ) -> str: indent = " " * indent_level if "$ref" in param: @@ -1783,11 +1758,11 @@ def generate_type_definition( -1 ] # Extract the type name from the reference return ref_name - elif param.get("type") == "array": + if param.get("type") == "array": items = param.get("items", {}) item_type = generate_type_definition(items, indent_level + 1, shared_defs) return f"Array<{item_type}>" - elif param.get("type") == "object": + if param.get("type") == "object": properties = param.get("properties", {}) nested_schema = "{\n" for nested_param_name, nested_param in properties.items(): @@ -1799,12 +1774,11 @@ def generate_type_definition( ) nested_schema += indent + "}" return nested_schema - elif "enum" in param: + if "enum" in param: # Enum type return " | ".join([f'"{enum_value}"' for enum_value in param["enum"]]) - else: - # Simple type - return param.get("type", "any") + # Simple type + return param.get("type", "any") def generate_shared_definitions(shared_defs, indent_level: int) -> str: indent = " " * indent_level @@ -1858,14 +1832,14 @@ def generate_schema_from_functions(functions, namespace="functions") -> str: return schema def prepare_messages_for_inference( - messages: List[llama_types.ChatCompletionRequestMessage], + messages: list[llama_types.ChatCompletionRequestMessage], tokenizer: AutoTokenizer, version: Literal["v1", "v2"], - functions: Optional[List[llama_types.ChatCompletionFunctions]] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Union[Dict, str] = "auto", + functions: list[llama_types.ChatCompletionFunctions] | None = None, + tools: list[llama_types.ChatCompletionTool] | None = None, + tool_choice: dict | str = "auto", ): - all_messages: List[llama_types.ChatCompletionRequestMessage] = [] + all_messages: list[llama_types.ChatCompletionRequestMessage] = [] if tool_choice == "none": all_messages.append( llama_types.ChatCompletionRequestSystemMessage( @@ -2428,199 +2402,192 @@ def generate_streaming(tools, functions, function_call, prompt): return generate_streaming( tools=tools, functions=functions, function_call=function_call, prompt=prompt, ) - else: - if version == "v1": - # If no or "auto" tool_choice/function_call - if isinstance(function_call, str) and function_call == "auto": - stops = ["\n", END_ASSISTANT_TOKEN] - # If tool_choice/function_call is provided - elif isinstance(function_call, dict): - prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n" - stops = END_FUNCTION_CALL_TOKEN - function_call = function_call["name"] - function_calls.append(function_call) - grammar = get_grammar(function_call) - else: - prompt = prompt - stops = ["\n", END_ASSISTANT_TOKEN] - - completion = create_completion(prompt=prompt, stop=stops, grammar=grammar) - completion_text = completion["choices"][0]["text"] - completion_tokens += completion["usage"]["completion_tokens"] - - # If the generation does not involve a function call - if ( - START_FUNCTION_CALL_TOKEN not in prompt - and START_FUNCTION_CALL_TOKEN not in completion_text - ): - completion["usage"]["completion_tokens"] = completion_tokens - return _convert_completion_to_chat(completion, stream=stream) # type: ignore - # If 
the generation involves a function call in completion, generate the parameters - elif ( - START_FUNCTION_CALL_TOKEN not in prompt - and START_FUNCTION_CALL_TOKEN in completion_text - ): - prompt += ( - completion_text.replace( - f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN, - ) - + "\n" - ) - function_calls.append( - completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip(), - ) - grammar = get_grammar(function_calls[-1]) - completion = create_completion( - prompt=prompt, stop=END_FUNCTION_CALL_TOKEN, grammar=grammar, - ) - completion_tokens += completion["usage"]["completion_tokens"] - function_bodies.append(completion["choices"][0]["text"].strip()) - # If the prompt involves a function call, just append generated parameters to function_bodies - else: - function_bodies.append(completion_text.strip()) + if version == "v1": + # If no or "auto" tool_choice/function_call + if isinstance(function_call, str) and function_call == "auto": + stops = ["\n", END_ASSISTANT_TOKEN] + # If tool_choice/function_call is provided elif isinstance(function_call, dict): - prompt += f"{function_call['name']}\n{CONTENT_TOKEN}" + prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n" + stops = END_FUNCTION_CALL_TOKEN function_call = function_call["name"] function_calls.append(function_call) grammar = get_grammar(function_call) - stops = [STOP_TOKEN, FROM_TOKEN] + else: + prompt = prompt + stops = ["\n", END_ASSISTANT_TOKEN] + + completion = create_completion(prompt=prompt, stop=stops, grammar=grammar) + completion_text = completion["choices"][0]["text"] + completion_tokens += completion["usage"]["completion_tokens"] + + # If the generation does not involve a function call + if ( + START_FUNCTION_CALL_TOKEN not in prompt + and START_FUNCTION_CALL_TOKEN not in completion_text + ): + completion["usage"]["completion_tokens"] = completion_tokens + return _convert_completion_to_chat(completion, stream=stream) # type: ignore + # If the generation involves a function call in completion, generate the parameters + if ( + START_FUNCTION_CALL_TOKEN not in prompt + and START_FUNCTION_CALL_TOKEN in completion_text + ): + prompt += ( + completion_text.replace( + f"{START_FUNCTION_CALL_TOKEN} ", START_FUNCTION_CALL_TOKEN, + ) + + "\n" + ) + function_calls.append( + completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip(), + ) + grammar = get_grammar(function_calls[-1]) + completion = create_completion( + prompt=prompt, stop=END_FUNCTION_CALL_TOKEN, grammar=grammar, + ) + completion_tokens += completion["usage"]["completion_tokens"] + function_bodies.append(completion["choices"][0]["text"].strip()) + # If the prompt involves a function call, just append generated parameters to function_bodies + else: + function_bodies.append(completion_text.strip()) + elif isinstance(function_call, dict): + prompt += f"{function_call['name']}\n{CONTENT_TOKEN}" + function_call = function_call["name"] + function_calls.append(function_call) + grammar = get_grammar(function_call) + stops = [STOP_TOKEN, FROM_TOKEN] + completion = create_completion( + prompt=prompt, stop=stops, grammar=grammar, + ) + completion_text = completion["choices"][0]["text"] + completion_tokens += completion["usage"]["completion_tokens"] + function_bodies.append(completion_text.strip()) + # If "auto" or no tool_choice/function_call + elif isinstance(function_call, str) and function_call == "auto": + while True: + # Generate function name first + grammar = None + stops = CONTENT_TOKEN completion = create_completion( prompt=prompt, 
stop=stops, grammar=grammar, ) completion_text = completion["choices"][0]["text"] completion_tokens += completion["usage"]["completion_tokens"] - function_bodies.append(completion_text.strip()) - # If "auto" or no tool_choice/function_call - elif isinstance(function_call, str) and function_call == "auto": - while True: - # Generate function name first + function_name = completion_text.strip() + if function_name == "all": + prompt += "all\n<|content|>" + else: + function_call = completion_text.strip() + prompt += f"{function_call}\n<|content|>" + function_calls.append(function_call) + grammar = get_grammar(function_call) + # Generate content + stops = [RECIPIENT_TOKEN, STOP_TOKEN] + completion = create_completion( + prompt=prompt, stop=stops, grammar=grammar, + ) + completion_text = completion["choices"][0]["text"] + completion_tokens += completion["usage"]["completion_tokens"] + if function_name == "all": + if completion_text.endswith("\n<|from|>assistant\n"): + content += completion_text[: -len("\n<|from|>assistant\n")] + if completion_text.endswith("\n<|from|> assistant\n"): + content += completion_text[-len("\n<|from|> assistant\n")] + else: + content += completion_text + content = content.lstrip() + # Check whether the model wants to generate another turn + if ( + "<|from|> assistant" in completion_text + or "<|from|>assistant" in completion_text + ): + if completion_text.endswith("\n<|from|>assistant\n"): + cleaned_completion_text = completion_text[ + : -len("\n<|from|>assistant\n") + ].strip() + elif completion_text.endswith("\n<|from|> assistant\n"): + cleaned_completion_text = completion_text[ + -len("\n<|from|> assistant\n") + ].strip() + else: + cleaned_completion_text = completion_text.strip() + prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>" + else: + break + else: + function_bodies.append(completion_text.strip()) + # Check whether the model wants to generate another turn + prompt += completion_text.strip() grammar = None - stops = CONTENT_TOKEN completion = create_completion( prompt=prompt, stop=stops, grammar=grammar, ) - completion_text = completion["choices"][0]["text"] completion_tokens += completion["usage"]["completion_tokens"] - function_name = completion_text.strip() - if function_name == "all": - prompt += "all\n<|content|>" + if ( + "<|from|> assistant" in completion["choices"][0]["text"] + or "<|from|>assistant" in completion["choices"][0]["text"] + ): + prompt += "\n<|from|>assistant\n<|recipient|>" else: - function_call = completion_text.strip() - prompt += f"{function_call}\n<|content|>" - function_calls.append(function_call) - grammar = get_grammar(function_call) - # Generate content - stops = [RECIPIENT_TOKEN, STOP_TOKEN] - completion = create_completion( - prompt=prompt, stop=stops, grammar=grammar, - ) - completion_text = completion["choices"][0]["text"] - completion_tokens += completion["usage"]["completion_tokens"] - if function_name == "all": - if completion_text.endswith("\n<|from|>assistant\n"): - content += completion_text[: -len("\n<|from|>assistant\n")] - if completion_text.endswith("\n<|from|> assistant\n"): - content += completion_text[-len("\n<|from|> assistant\n")] - else: - content += completion_text - content = content.lstrip() - # Check whether the model wants to generate another turn - if ( - "<|from|> assistant" in completion_text - or "<|from|>assistant" in completion_text - ): - if completion_text.endswith("\n<|from|>assistant\n"): - cleaned_completion_text = completion_text[ - : -len("\n<|from|>assistant\n") - 
].strip() - elif completion_text.endswith("\n<|from|> assistant\n"): - cleaned_completion_text = completion_text[ - -len("\n<|from|> assistant\n") - ].strip() - else: - cleaned_completion_text = completion_text.strip() - prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>" - else: - break - else: - function_bodies.append(completion_text.strip()) - # Check whether the model wants to generate another turn - prompt += completion_text.strip() - grammar = None - completion = create_completion( - prompt=prompt, stop=stops, grammar=grammar, - ) - completion_tokens += completion["usage"]["completion_tokens"] - if ( - "<|from|> assistant" in completion["choices"][0]["text"] - or "<|from|>assistant" in completion["choices"][0]["text"] - ): - prompt += "\n<|from|>assistant\n<|recipient|>" - else: - break + break - assert "usage" in completion - assert len(function_calls) == len(function_bodies) + assert "usage" in completion + assert len(function_calls) == len(function_bodies) - tool_calls: List[llama_types.ChatCompletionMessageToolCall] = [] - for function_call, function_body in zip(function_calls, function_bodies): - tool_calls.append( - { - "id": "call_" - + "".join( - [ - random.choice(string.ascii_letters + string.digits) - for _ in range(24) - ], - ), - "type": "function", - "function": { - "name": function_call, - "arguments": function_body, - }, + tool_calls: list[llama_types.ChatCompletionMessageToolCall] = [] + for function_call, function_body in zip(function_calls, function_bodies, strict=False): + tool_calls.append( + { + "id": "call_" + + "".join( + [ + random.choice(string.ascii_letters + string.digits) + for _ in range(24) + ], + ), + "type": "function", + "function": { + "name": function_call, + "arguments": function_body, }, - ) + }, + ) - # TODO: support stream mode - function_call_dict: Union[ - Dict[str, str], - Dict[ - Literal["function_call"], - llama_types.ChatCompletionRequestAssistantMessageFunctionCall, - ], - ] = {} - if len(tool_calls) > 0: - if tools is not None: - function_call_dict["tool_calls"] = tool_calls - else: - function_call_dict["function_call"] = { - "name": tool_calls[0]["function"]["name"], - "arguments": tool_calls[0]["function"]["arguments"], - } - completion["usage"]["completion_tokens"] = completion_tokens - return llama_types.CreateChatCompletionResponse( - id="chat" + completion["id"], - object="chat.completion", - created=completion["created"], - model=completion["model"], - choices=[ - { - "index": 0, - "logprobs": completion["choices"][0]["logprobs"], - "message": { - "role": "assistant", - "content": None if content == "" else content, - **function_call_dict, - }, - "finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop", + # TODO: support stream mode + function_call_dict: dict[str, str] | dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall] = {} + if len(tool_calls) > 0: + if tools is not None: + function_call_dict["tool_calls"] = tool_calls + else: + function_call_dict["function_call"] = { + "name": tool_calls[0]["function"]["name"], + "arguments": tool_calls[0]["function"]["arguments"], + } + completion["usage"]["completion_tokens"] = completion_tokens + return llama_types.CreateChatCompletionResponse( + id="chat" + completion["id"], + object="chat.completion", + created=completion["created"], + model=completion["model"], + choices=[ + { + "index": 0, + "logprobs": completion["choices"][0]["logprobs"], + "message": { + "role": "assistant", + "content": None if content == "" 
else content, + **function_call_dict, }, - ], - usage=completion["usage"], - ) + "finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop", + }, + ], + usage=completion["usage"], + ) class Llava15ChatHandler: - DEFAULT_SYSTEM_MESSAGE: Optional[str] = ( + DEFAULT_SYSTEM_MESSAGE: str | None = ( "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." ) @@ -2667,10 +2634,8 @@ def __init__(self, clip_model_path: str, verbose: bool = True): self._llava_cpp = llava_cpp # TODO: Fix self._exit_stack = ExitStack() - self._last_image_embed: Optional[ - llava_cpp.CtypesPointer[llava_cpp.llava_image_embed] - ] = None - self._last_image_hash: Optional[int] = None + self._last_image_embed: llava_cpp.CtypesPointer[llava_cpp.llava_image_embed] | None = None + self._last_image_hash: int | None = None if not os.path.exists(clip_model_path): raise ValueError(f"Clip model path does not exist: {clip_model_path}") @@ -2704,23 +2669,21 @@ def __call__( self, *, llama: llama.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + messages: list[llama_types.ChatCompletionRequestMessage], + functions: list[llama_types.ChatCompletionFunction] | None = None, + function_call: llama_types.ChatCompletionRequestFunctionCall | None = None, + tools: list[llama_types.ChatCompletionTool] | None = None, + tool_choice: llama_types.ChatCompletionToolChoiceOption | None = None, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, min_p: float = 0.05, typical_p: float = 1.0, stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - seed: Optional[int] = None, - response_format: Optional[ - llama_types.ChatCompletionRequestResponseFormat - ] = None, - max_tokens: Optional[int] = None, + stop: str | list[str] | None = [], + seed: int | None = None, + response_format: llama_types.ChatCompletionRequestResponseFormat | None = None, + max_tokens: int | None = None, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -2728,17 +2691,14 @@ def __call__( mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - model: Optional[str] = None, - logits_processor: Optional[llama.LogitsProcessorList] = None, - grammar: Optional[llama.LlamaGrammar] = None, - logit_bias: Optional[Dict[str, float]] = None, - logprobs: Optional[bool] = None, - top_logprobs: Optional[int] = None, + model: str | None = None, + logits_processor: llama.LogitsProcessorList | None = None, + grammar: llama.LlamaGrammar | None = None, + logit_bias: dict[str, float] | None = None, + logprobs: bool | None = None, + top_logprobs: int | None = None, **kwargs, # type: ignore - ) -> Union[ - llama_types.CreateChatCompletionResponse, - Iterator[llama_types.CreateChatCompletionStreamResponse], - ]: + ) -> llama_types.CreateChatCompletionResponse | Iterator[llama_types.CreateChatCompletionStreamResponse]: assert self.clip_ctx is not None system_prompt = _get_system_message(messages) @@ -2912,16 +2872,15 @@ def _load_image(image_url: str) -> bytes: image_bytes = base64.b64decode(image_url.split(",")[1]) return image_bytes - else: - import urllib.request + import urllib.request - with 
urllib.request.urlopen(image_url) as f: - image_bytes = f.read() - return image_bytes + with urllib.request.urlopen(image_url) as f: + image_bytes = f.read() + return image_bytes @staticmethod - def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): - image_urls: List[str] = [] + def get_image_urls(messages: list[llama_types.ChatCompletionRequestMessage]): + image_urls: list[str] = [] for message in messages: if message["role"] == "user": if message["content"] is None: @@ -2939,15 +2898,15 @@ def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): return image_urls @staticmethod - def split_text_on_image_urls(text: str, image_urls: List[str]): - def find_first(s: str, substrs: List[str]): + def split_text_on_image_urls(text: str, image_urls: list[str]): + def find_first(s: str, substrs: list[str]): for i, substr in enumerate(substrs): pos = s.find(substr) if pos != -1: return pos, i return None, None - split_text: List[Tuple[Literal["text", "image_url"], str]] = [] + split_text: list[tuple[Literal["text", "image_url"], str]] = [] remaining = text while remaining: # Find first image_url @@ -2966,10 +2925,10 @@ def find_first(s: str, substrs: List[str]): def from_pretrained( cls, repo_id: str, - filename: Optional[str], - local_dir: Optional[Union[str, os.PathLike[str]]] = None, - local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", - cache_dir: Optional[Union[str, os.PathLike[str]]] = None, + filename: str | None, + local_dir: str | os.PathLike[str] | None = None, + local_dir_use_symlinks: bool | Literal["auto"] = "auto", + cache_dir: str | os.PathLike[str] | None = None, **kwargs: Any, ) -> Llava15ChatHandler: import fnmatch @@ -2994,7 +2953,7 @@ def from_pretrained( ] # split each file into repo_id, subfolder, filename - file_list: List[str] = [] + file_list: list[str] = [] for file in files: rel_path = Path(file).relative_to(repo_id) file_list.append(str(rel_path)) @@ -3023,9 +2982,9 @@ def from_pretrained( repo_id=repo_id, filename=filename, subfolder=subfolder, - local_dir=cast(Union[str, Path, None], local_dir), + local_dir=cast(str | Path | None, local_dir), local_dir_use_symlinks=local_dir_use_symlinks, - cache_dir=cast(Union[str, Path, None], cache_dir), + cache_dir=cast(str | Path | None, cache_dir), ) if local_dir is None: @@ -3035,7 +2994,7 @@ def from_pretrained( subfolder=subfolder, local_dir=local_dir, local_dir_use_symlinks=local_dir_use_symlinks, - cache_dir=cast(Union[str, Path, None], cache_dir), + cache_dir=cast(str | Path | None, cache_dir), local_files_only=True, ) else: @@ -3304,20 +3263,20 @@ class Llama3VisionAlphaChatHandler(Llava15ChatHandler): @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama.Llama, - messages: List[llama_types.ChatCompletionRequestMessage], - functions: Optional[List[llama_types.ChatCompletionFunction]] = None, - function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, - tools: Optional[List[llama_types.ChatCompletionTool]] = None, - tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None, + messages: list[llama_types.ChatCompletionRequestMessage], + functions: list[llama_types.ChatCompletionFunction] | None = None, + function_call: llama_types.ChatCompletionRequestFunctionCall | None = None, + tools: list[llama_types.ChatCompletionTool] | None = None, + tool_choice: llama_types.ChatCompletionToolChoiceOption | None = None, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, min_p: float = 
0.05, typical_p: float = 1.0, stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None, - max_tokens: Optional[int] = None, + stop: str | list[str] | None = [], + response_format: llama_types.ChatCompletionRequestResponseFormat | None = None, + max_tokens: int | None = None, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.1, @@ -3325,16 +3284,13 @@ def chatml_function_calling( mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - model: Optional[str] = None, - logits_processor: Optional[llama.LogitsProcessorList] = None, - grammar: Optional[llama.LlamaGrammar] = None, - logprobs: Optional[bool] = None, - top_logprobs: Optional[int] = None, + model: str | None = None, + logits_processor: llama.LogitsProcessorList | None = None, + grammar: llama.LlamaGrammar | None = None, + logprobs: bool | None = None, + top_logprobs: int | None = None, **kwargs, # type: ignore -) -> Union[ - llama_types.CreateChatCompletionResponse, - Iterator[llama_types.CreateChatCompletionStreamResponse], -]: +) -> llama_types.CreateChatCompletionResponse | Iterator[llama_types.CreateChatCompletionStreamResponse]: function_calling_template = ( "{% for message in messages %}" "<|im_start|>{{ message.role }}\n" @@ -3594,8 +3550,8 @@ def chatml_function_calling( tool_name = text[len("functions.") :] tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None) if not stream: - completions: List[llama_types.CreateCompletionResponse] = [] - completions_tool_name: List[str] = [] + completions: list[llama_types.CreateCompletionResponse] = [] + completions_tool_name: list[str] = [] while tool is not None: prompt += f"functions.{tool_name}:\n" try: @@ -3671,13 +3627,7 @@ def chatml_function_calling( ) # Merge completions - function_call_dict: Union[ - Dict[str, str], - Dict[ - Literal["function_call"], - llama_types.ChatCompletionRequestAssistantMessageFunctionCall, - ], - ] = ( + function_call_dict: dict[str, str] | dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall] = ( { "function_call": { "name": tool_name, @@ -3714,7 +3664,7 @@ def chatml_function_calling( }, } for i, (tool_name, completion) in enumerate( - zip(completions_tool_name, completions), + zip(completions_tool_name, completions, strict=False), ) ], **function_call_dict, diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 7d1fda3f1..40d91874e 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -5,20 +5,19 @@ import os import pathlib import sys +from collections.abc import Callable from typing import ( TYPE_CHECKING, Any, - Callable, Generic, List, NewType, Optional, + TypeAlias, TypeVar, Union, ) -from typing_extensions import TypeAlias - # Load the library def _load_shared_library(lib_base_name: str): @@ -26,7 +25,7 @@ def _load_shared_library(lib_base_name: str): _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" # Searching for the library in the current directory under the name "libllama" (default name # for llamacpp) and "llama" (default name for this repo) - _lib_paths: List[pathlib.Path] = [] + _lib_paths: list[pathlib.Path] = [] # Determine the file extension based on the platform if sys.platform.startswith("linux") or sys.platform.startswith("freebsd"): _lib_paths += [ @@ -108,9 +107,7 @@ def _load_shared_library(lib_base_name: str): class CtypesRef(Generic[CtypesCData]): 
pass - CtypesPointerOrRef: TypeAlias = Union[ - CtypesPointer[CtypesCData], CtypesRef[CtypesCData], - ] + CtypesPointerOrRef: TypeAlias = CtypesPointer[CtypesCData] | CtypesRef[CtypesCData] CtypesFuncPointer: TypeAlias = ctypes._FuncPointer # type: ignore @@ -119,7 +116,7 @@ class CtypesRef(Generic[CtypesCData]): def ctypes_function_for_shared_library(lib: ctypes.CDLL): def ctypes_function( - name: str, argtypes: List[Any], restype: Any, enabled: bool = True, + name: str, argtypes: list[Any], restype: Any, enabled: bool = True, ): def decorator(f: F) -> F: if enabled: @@ -139,7 +136,7 @@ def decorator(f: F) -> F: ctypes_function = ctypes_function_for_shared_library(_lib) -def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]: +def byref(obj: CtypesCData, offset: int | None = None) -> CtypesRef[CtypesCData]: """Type-annotated version of ctypes.byref""" @@ -698,7 +695,7 @@ class llama_model_kv_override(ctypes.Structure): if TYPE_CHECKING: tag: int key: bytes - value: Union[int, float, bool, bytes] + value: int | float | bool | bytes # struct llama_model_params { @@ -1209,7 +1206,7 @@ def llama_backend_free(): ) def llama_load_model_from_file( path_model: bytes, params: llama_model_params, /, -) -> Optional[llama_model_p]: +) -> llama_model_p | None: ... @@ -1233,7 +1230,7 @@ def llama_free_model(model: llama_model_p, /): ) def llama_new_context_with_model( model: llama_model_p, params: llama_context_params, /, -) -> Optional[llama_context_p]: +) -> llama_context_p | None: ... @@ -1284,7 +1281,7 @@ def llama_supports_gpu_offload() -> bool: # LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); @ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes) -def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: +def llama_get_model(ctx: llama_context_p, /) -> llama_model_p | None: ... 
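
[Editorial note, not part of the patch: the hunks above and below consistently rewrite `typing.Optional[X]` and `typing.Union[X, Y]` annotations as PEP 604 unions (`X | None`, `X | Y`). A minimal, self-contained sketch of the equivalence follows; the function and argument names in it are invented for illustration. `X | Y` in annotations evaluated at runtime needs Python 3.10+, or `from __future__ import annotations` when only the annotation text matters.]

from __future__ import annotations

from typing import Optional, Union


def old_style(path: Optional[str], retries: Union[int, float] = 3) -> Optional[bytes]:
    """Pre-PEP 604 spelling."""
    return None if path is None else path.encode()


def new_style(path: str | None, retries: int | float = 3) -> bytes | None:
    """Equivalent PEP 604 spelling used throughout these hunks."""
    return None if path is None else path.encode()


# Both spellings describe the same types and behave identically.
assert old_style("a") == new_style("a") == b"a"
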
@@ -1381,7 +1378,7 @@ def llama_rope_freq_scale_train(model: llama_model_p, /) -> float: ) def llama_model_meta_val_str( model: llama_model_p, - key: Union[ctypes.c_char_p, bytes], + key: ctypes.c_char_p | bytes, buf: bytes, buf_size: int, /, @@ -1410,8 +1407,8 @@ def llama_model_meta_count(model: llama_model_p, /) -> int: ) def llama_model_meta_key_by_index( model: llama_model_p, - i: Union[ctypes.c_int, int], - buf: Union[bytes, CtypesArray[ctypes.c_char]], + i: ctypes.c_int | int, + buf: bytes | CtypesArray[ctypes.c_char], buf_size: int, /, ) -> int: @@ -1432,8 +1429,8 @@ def llama_model_meta_key_by_index( ) def llama_model_meta_val_str_by_index( model: llama_model_p, - i: Union[ctypes.c_int, int], - buf: Union[bytes, CtypesArray[ctypes.c_char]], + i: ctypes.c_int | int, + buf: bytes | CtypesArray[ctypes.c_char], buf_size: int, /, ) -> int: @@ -1449,8 +1446,8 @@ def llama_model_meta_val_str_by_index( ) def llama_model_desc( model: llama_model_p, - buf: Union[bytes, CtypesArray[ctypes.c_char]], - buf_size: Union[ctypes.c_size_t, int], + buf: bytes | CtypesArray[ctypes.c_char], + buf_size: ctypes.c_size_t | int, /, ) -> int: """Get a string describing the model type""" @@ -1476,7 +1473,7 @@ def llama_model_n_params(model: llama_model_p, /) -> int: "llama_get_model_tensor", [llama_model_p_ctypes, ctypes.c_char_p], ctypes.c_void_p, ) def llama_get_model_tensor( - model: llama_model_p, name: Union[ctypes.c_char_p, bytes], /, + model: llama_model_p, name: ctypes.c_char_p | bytes, /, ) -> ctypes.c_void_p: """Get a llama model tensor""" @@ -1542,7 +1539,7 @@ def llama_model_quantize( ) def llama_lora_adapter_init( model: llama_model_p, path_lora: bytes, /, -) -> Optional[llama_lora_adapter_p]: +) -> llama_lora_adapter_p | None: """Load a LoRA adapter from file The loaded adapter will be associated to the given model, and will be free when the model is deleted """ @@ -1740,7 +1737,7 @@ class llama_kv_cache_view(ctypes.Structure): llama_kv_cache_view, ) def llama_kv_cache_view_init( - ctx: llama_context_p, n_seq_max: Union[ctypes.c_int32, int], /, + ctx: llama_context_p, n_seq_max: ctypes.c_int32 | int, /, ) -> llama_kv_cache_view: """Create an empty KV cache view. 
(use only for debugging purposes)""" @@ -1812,9 +1809,9 @@ def llama_kv_cache_clear(ctx: llama_context_p, /): ) def llama_kv_cache_seq_rm( ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], + seq_id: llama_seq_id | int, + p0: llama_pos | int, + p1: llama_pos | int, /, ) -> bool: """Removes all tokens that belong to the specified sequence and have positions in [p0, p1) @@ -1849,10 +1846,10 @@ def llama_kv_cache_seq_rm( ) def llama_kv_cache_seq_cp( ctx: llama_context_p, - seq_id_src: Union[llama_seq_id, int], - seq_id_dst: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], + seq_id_src: llama_seq_id | int, + seq_id_dst: llama_seq_id | int, + p0: llama_pos | int, + p1: llama_pos | int, /, ): """Copy all tokens that belong to the specified sequence to another sequence @@ -1868,7 +1865,7 @@ def llama_kv_cache_seq_cp( @ctypes_function( "llama_kv_cache_seq_keep", [llama_context_p_ctypes, llama_seq_id], None, ) -def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): +def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: llama_seq_id | int, /): """Removes all tokens that do not belong to the specified sequence""" @@ -1897,10 +1894,10 @@ def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, in ) def llama_kv_cache_seq_add( ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - delta: Union[llama_pos, int], + seq_id: llama_seq_id | int, + p0: llama_pos | int, + p1: llama_pos | int, + delta: llama_pos | int, /, ): """Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) @@ -1934,10 +1931,10 @@ def llama_kv_cache_seq_add( ) def llama_kv_cache_seq_div( ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - d: Union[ctypes.c_int, int], + seq_id: llama_seq_id | int, + p0: llama_pos | int, + p1: llama_pos | int, + d: ctypes.c_int | int, /, ): """Integer division of the positions by factor of `d > 1` @@ -2007,7 +2004,7 @@ def llama_get_state_size(ctx: llama_context_p, /) -> int: def llama_state_get_data( ctx: llama_context_p, dst: CtypesArray[ctypes.c_uint8], - size: Union[ctypes.c_size_t, int], + size: ctypes.c_size_t | int, /, ) -> int: """Copies the state to the specified destination address. @@ -2049,7 +2046,7 @@ def llama_copy_state_data( def llama_state_set_data( ctx: llama_context_p, src: CtypesArray[ctypes.c_uint8], - size: Union[ctypes.c_size_t, int], + size: ctypes.c_size_t | int, /, ) -> int: """Set the state reading from the specified address @@ -2093,7 +2090,7 @@ def llama_state_load_file( ctx: llama_context_p, path_session: bytes, tokens_out: CtypesArray[llama_token], - n_token_capacity: Union[ctypes.c_size_t, int], + n_token_capacity: ctypes.c_size_t | int, n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, ) -> bool: @@ -2122,7 +2119,7 @@ def llama_load_session_file( ctx: llama_context_p, path_session: bytes, tokens_out: CtypesArray[llama_token], - n_token_capacity: Union[ctypes.c_size_t, int], + n_token_capacity: ctypes.c_size_t | int, n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, ) -> int: @@ -2148,7 +2145,7 @@ def llama_state_save_file( ctx: llama_context_p, path_session: bytes, tokens: CtypesArray[llama_token], - n_token_count: Union[ctypes.c_size_t, int], + n_token_count: ctypes.c_size_t | int, /, ) -> bool: ... 
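
[Editorial note, not part of the patch: earlier in this patch the functionary handler's `zip(function_calls, function_bodies)` and `zip(completions_tool_name, completions)` calls gain an explicit `strict=False`, likely to satisfy a lint rule such as Ruff's B905 (`zip()` without an explicit `strict=`). The keyword exists on `zip()` from Python 3.10; `strict=False` keeps the historical truncating behavior, so the change is behavior-preserving. A small standalone sketch, with made-up data:]

# strict=False (the historical default): silently truncates to the shorter input.
pairs = list(zip(["a", "b"], [1, 2, 3], strict=False))
assert pairs == [("a", 1), ("b", 2)]

# strict=True: raises ValueError when the inputs have different lengths.
try:
    list(zip(["a", "b"], [1, 2, 3], strict=True))
except ValueError as exc:
    print(f"strict=True rejected the mismatch: {exc}")
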
@@ -2174,7 +2171,7 @@ def llama_save_session_file( ctx: llama_context_p, path_session: bytes, tokens: CtypesArray[llama_token], - n_token_count: Union[ctypes.c_size_t, int], + n_token_count: ctypes.c_size_t | int, /, ) -> int: ... @@ -2212,7 +2209,7 @@ def llama_state_seq_get_size(ctx: llama_context_p, seq_id: llama_seq_id, /) -> i def llama_state_seq_get_data( ctx: llama_context_p, dst: CtypesArray[ctypes.c_uint8], - size: Union[ctypes.c_size_t, int], + size: ctypes.c_size_t | int, seq_id: llama_seq_id, /, ) -> int: @@ -2241,7 +2238,7 @@ def llama_state_seq_get_data( def llama_state_seq_set_data( ctx: llama_context_p, src: CtypesArray[ctypes.c_uint8], - size: Union[ctypes.c_size_t, int], + size: ctypes.c_size_t | int, dest_seq_id: llama_seq_id, /, ) -> int: @@ -2270,7 +2267,7 @@ def llama_state_seq_save_file( filepath: bytes, seq_id: llama_seq_id, tokens: CtypesArray[llama_token], - n_token_count: Union[ctypes.c_size_t, int], + n_token_count: ctypes.c_size_t | int, /, ) -> int: ... @@ -2300,7 +2297,7 @@ def llama_state_seq_load_file( filepath: bytes, dest_seq_id: llama_seq_id, tokens_out: CtypesArray[llama_token], - n_token_capacity: Union[ctypes.c_size_t, int], + n_token_capacity: ctypes.c_size_t | int, n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, ) -> int: @@ -2333,8 +2330,8 @@ def llama_state_seq_load_file( ) def llama_batch_get_one( tokens: CtypesArray[llama_token], - n_tokens: Union[ctypes.c_int, int], - pos_0: Union[llama_pos, int], + n_tokens: ctypes.c_int | int, + pos_0: llama_pos | int, seq_id: llama_seq_id, /, ) -> llama_batch: @@ -2359,9 +2356,9 @@ def llama_batch_get_one( "llama_batch_init", [ctypes.c_int32, ctypes.c_int32, ctypes.c_int32], llama_batch, ) def llama_batch_init( - n_tokens: Union[ctypes.c_int32, int], - embd: Union[ctypes.c_int32, int], - n_seq_max: Union[ctypes.c_int32, int], + n_tokens: ctypes.c_int32 | int, + embd: ctypes.c_int32 | int, + n_seq_max: ctypes.c_int32 | int, /, ) -> llama_batch: """Allocates a batch of tokens on the heap that can hold a maximum of n_tokens @@ -2425,8 +2422,8 @@ def llama_decode(ctx: llama_context_p, batch: llama_batch, /) -> int: ) def llama_set_n_threads( ctx: llama_context_p, - n_threads: Union[ctypes.c_uint32, int], - n_threads_batch: Union[ctypes.c_uint32, int], + n_threads: ctypes.c_uint32 | int, + n_threads_batch: ctypes.c_uint32 | int, /, ): """Set the number of threads used for decoding @@ -2525,7 +2522,7 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: ctypes.POINTER(ctypes.c_float), ) def llama_get_logits_ith( - ctx: llama_context_p, i: Union[ctypes.c_int32, int], /, + ctx: llama_context_p, i: ctypes.c_int32 | int, /, ) -> CtypesArray[ctypes.c_float]: """Logits for the ith token. 
Equivalent to: llama_get_logits(ctx) + i*n_vocab""" @@ -2558,7 +2555,7 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float] ctypes.POINTER(ctypes.c_float), ) def llama_get_embeddings_ith( - ctx: llama_context_p, i: Union[ctypes.c_int32, int], /, + ctx: llama_context_p, i: ctypes.c_int32 | int, /, ) -> CtypesArray[ctypes.c_float]: """Get the embeddings for the ith sequence llama_get_embeddings(ctx) + i*n_embd""" @@ -2574,7 +2571,7 @@ def llama_get_embeddings_ith( ctypes.POINTER(ctypes.c_float), ) def llama_get_embeddings_seq( - ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /, + ctx: llama_context_p, seq_id: llama_seq_id | int, /, ) -> CtypesArray[ctypes.c_float]: """Get the embeddings for a sequence id Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE @@ -2591,7 +2588,7 @@ def llama_get_embeddings_seq( "llama_token_get_text", [llama_model_p_ctypes, llama_token], ctypes.c_char_p, ) def llama_token_get_text( - model: llama_model_p, token: Union[llama_token, int], /, + model: llama_model_p, token: llama_token | int, /, ) -> bytes: ... @@ -2601,7 +2598,7 @@ def llama_token_get_text( "llama_token_get_score", [llama_model_p_ctypes, llama_token], ctypes.c_float, ) def llama_token_get_score( - model: llama_model_p, token: Union[llama_token, int], /, + model: llama_model_p, token: llama_token | int, /, ) -> float: ... @@ -2611,7 +2608,7 @@ def llama_token_get_score( "llama_token_get_attr", [llama_model_p_ctypes, llama_token], ctypes.c_int, ) def llama_token_get_attr( - model: llama_model_p, token: Union[llama_token, int], /, + model: llama_model_p, token: llama_token | int, /, ) -> int: ... @@ -2621,7 +2618,7 @@ def llama_token_get_attr( @ctypes_function( "llama_token_is_eog", [llama_model_p_ctypes, llama_token], ctypes.c_bool, ) -def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /) -> bool: +def llama_token_is_eog(model: llama_model_p, token: llama_token | int, /) -> bool: """Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)""" @@ -2631,7 +2628,7 @@ def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /) "llama_token_is_control", [llama_model_p_ctypes, llama_token], ctypes.c_bool, ) def llama_token_is_control( - model: llama_model_p, token: Union[llama_token, int], /, + model: llama_model_p, token: llama_token | int, /, ) -> bool: """Identify if Token Id is a control token or a render-able token""" @@ -2744,11 +2741,11 @@ def llama_token_eot(model: llama_model_p, /) -> int: def llama_tokenize( model: llama_model_p, text: bytes, - text_len: Union[ctypes.c_int, int], + text_len: ctypes.c_int | int, tokens: CtypesArray[llama_token], - n_tokens_max: Union[ctypes.c_int, int], - add_special: Union[ctypes.c_bool, bool], - parse_special: Union[ctypes.c_bool, bool], + n_tokens_max: ctypes.c_int | int, + add_special: ctypes.c_bool | bool, + parse_special: ctypes.c_bool | bool, /, ) -> int: """Convert the provided text into tokens. @@ -2794,11 +2791,11 @@ def llama_tokenize( ) def llama_token_to_piece( model: llama_model_p, - token: Union[llama_token, int], - buf: Union[ctypes.c_char_p, bytes, CtypesArray[ctypes.c_char]], - length: Union[ctypes.c_int, int], - lstrip: Union[ctypes.c_int, int], - special: Union[ctypes.c_bool, bool], + token: llama_token | int, + buf: ctypes.c_char_p | bytes | CtypesArray[ctypes.c_char], + length: ctypes.c_int | int, + lstrip: ctypes.c_int | int, + special: ctypes.c_bool | bool, /, ) -> int: """Token Id -> Piece. 
@@ -2845,11 +2842,11 @@ def llama_token_to_piece( def llama_detokenize( model: llama_model_p, tokens: CtypesArray[llama_token], - n_tokens: Union[ctypes.c_int, int], + n_tokens: ctypes.c_int | int, text: bytes, - text_len_max: Union[ctypes.c_int, int], - remove_special: Union[ctypes.c_bool, bool], - unparse_special: Union[ctypes.c_bool, bool], + text_len_max: ctypes.c_int | int, + remove_special: ctypes.c_bool | bool, + unparse_special: ctypes.c_bool | bool, /, ) -> int: """Convert the provided tokens into text (inverse of llama_tokenize()). @@ -2929,10 +2926,10 @@ def llama_grammar_init( rules: CtypesArray[ CtypesPointer[llama_grammar_element] ], # NOTE: This might be wrong type sig - n_rules: Union[ctypes.c_size_t, int], - start_rule_index: Union[ctypes.c_size_t, int], + n_rules: ctypes.c_size_t | int, + start_rule_index: ctypes.c_size_t | int, /, -) -> Optional[llama_grammar_p]: +) -> llama_grammar_p | None: """Initialize a grammar from a set of rules.""" @@ -2973,9 +2970,7 @@ def llama_grammar_copy(grammar: llama_grammar_p, /) -> llama_grammar_p: def llama_grammar_sample( grammar: llama_grammar_p, ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], - ], + candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], /, ): """Apply constraints from grammar""" @@ -2993,9 +2988,7 @@ def llama_grammar_sample( ) def llama_sample_grammar( ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], - ], + candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], grammar, # type: llama_grammar_p /, ): @@ -3020,7 +3013,7 @@ def llama_sample_grammar( def llama_grammar_accept_token( grammar: llama_grammar_p, ctx: llama_context_p, - token: Union[llama_token, int], + token: llama_token | int, /, ): """Accepts the sampled token into the grammar""" @@ -3038,7 +3031,7 @@ def llama_grammar_accept_token( [llama_context_p_ctypes, ctypes.c_uint32], None, ) -def llama_set_rng_seed(ctx: llama_context_p, seed: Union[ctypes.c_uint32, int], /): +def llama_set_rng_seed(ctx: llama_context_p, seed: ctypes.c_uint32 | int, /): """Sets the current rng seed.""" @@ -3067,14 +3060,12 @@ def llama_set_rng_seed(ctx: llama_context_p, seed: Union[ctypes.c_uint32, int], ) def llama_sample_repetition_penalties( ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], - ], + candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], last_tokens_data: CtypesArray[llama_token], - penalty_last_n: Union[ctypes.c_size_t, int], - penalty_repeat: Union[ctypes.c_float, float], - penalty_freq: Union[ctypes.c_float, float], - penalty_present: Union[ctypes.c_float, float], + penalty_last_n: ctypes.c_size_t | int, + penalty_repeat: ctypes.c_float | float, + penalty_freq: ctypes.c_float | float, + penalty_present: ctypes.c_float | float, /, ): """Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. 
@@ -3105,7 +3096,7 @@ def llama_sample_apply_guidance( ctx: llama_context_p, logits: CtypesArray[ctypes.c_float], logits_guidance: CtypesArray[ctypes.c_float], - scale: Union[ctypes.c_float, float], + scale: ctypes.c_float | float, /, ): """Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806""" @@ -3122,9 +3113,7 @@ def llama_sample_apply_guidance( ) def llama_sample_softmax( ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], - ], + candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], /, ): """Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.""" @@ -3143,11 +3132,9 @@ def llama_sample_softmax( ) def llama_sample_top_k( ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], - ], - k: Union[ctypes.c_int, int], - min_keep: Union[ctypes.c_size_t, int], + candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], + k: ctypes.c_int | int, + min_keep: ctypes.c_size_t | int, /, ): """Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751""" @@ -3166,11 +3153,9 @@ def llama_sample_top_k( ) def llama_sample_top_p( ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], - ], - p: Union[ctypes.c_float, float], - min_keep: Union[ctypes.c_size_t, int], + candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], + p: ctypes.c_float | float, + min_keep: ctypes.c_size_t | int, /, ): """Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751""" @@ -3189,11 +3174,9 @@ def llama_sample_top_p( ) def llama_sample_min_p( ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], - ], - p: Union[ctypes.c_float, float], - min_keep: Union[ctypes.c_size_t, int], + candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], + p: ctypes.c_float | float, + min_keep: ctypes.c_size_t | int, /, ): """Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841""" @@ -3212,11 +3195,9 @@ def llama_sample_min_p( ) def llama_sample_tail_free( ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], - ], - z: Union[ctypes.c_float, float], - min_keep: Union[ctypes.c_size_t, int], + candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], + z: ctypes.c_float | float, + min_keep: ctypes.c_size_t | int, /, ): """Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.""" @@ -3235,11 +3216,9 @@ def llama_sample_tail_free( ) def llama_sample_typical( ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], - ], - p: Union[ctypes.c_float, float], - min_keep: Union[ctypes.c_size_t, int], + candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], + p: ctypes.c_float | float, + min_keep: ctypes.c_size_t | int, /, ): """Locally Typical Sampling implementation described in 
the paper https://arxiv.org/abs/2202.00666.""" @@ -3265,12 +3244,10 @@ def llama_sample_typical( ) def llama_sample_entropy( ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], - ], - min_temp: Union[ctypes.c_float, float], - max_temp: Union[ctypes.c_float, float], - exponent_val: Union[ctypes.c_float, float], + candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], + min_temp: ctypes.c_float | float, + max_temp: ctypes.c_float | float, + exponent_val: ctypes.c_float | float, /, ): """Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.""" @@ -3287,10 +3264,8 @@ def llama_sample_entropy( ) def llama_sample_temp( ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], - ], - temp: Union[ctypes.c_float, float], + candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], + temp: ctypes.c_float | float, /, ): """Temperature sampling described in academic paper "Generating Long Sequences with Sparse Transformers" https://arxiv.org/abs/1904.10509 @@ -3328,12 +3303,10 @@ def llama_sample_temp( ) def llama_sample_token_mirostat( ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], - ], - tau: Union[ctypes.c_float, float], - eta: Union[ctypes.c_float, float], - m: Union[ctypes.c_int, int], + candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], + tau: ctypes.c_float | float, + eta: ctypes.c_float | float, + m: ctypes.c_int | int, mu: CtypesPointerOrRef[ctypes.c_float], /, ) -> int: @@ -3372,11 +3345,9 @@ def llama_sample_token_mirostat( ) def llama_sample_token_mirostat_v2( ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], - ], - tau: Union[ctypes.c_float, float], - eta: Union[ctypes.c_float, float], + candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], + tau: ctypes.c_float | float, + eta: ctypes.c_float | float, mu: CtypesPointerOrRef[ctypes.c_float], /, ) -> int: @@ -3402,9 +3373,7 @@ def llama_sample_token_mirostat_v2( ) def llama_sample_token_greedy( ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], - ], + candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], /, ) -> int: """Selects the token with the highest probability.""" @@ -3421,9 +3390,7 @@ def llama_sample_token_greedy( ) def llama_sample_token( ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array], - ], + candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], /, ) -> int: """Randomly selects a token from the candidates based on their probabilities.""" @@ -3445,10 +3412,10 @@ def llama_sample_token( ) def llama_split_path( split_path: bytes, - maxlen: Union[ctypes.c_size_t, int], + maxlen: ctypes.c_size_t | int, path_prefix: bytes, - split_no: Union[ctypes.c_int, int], - split_count: Union[ctypes.c_int, int], + split_no: ctypes.c_int | int, + split_count: ctypes.c_int | int, /, ) -> int: """Build a split GGUF final path for this chunk.""" @@ -3465,10 +3432,10 @@ def llama_split_path( ) def llama_split_prefix( split_prefix: bytes, - 
maxlen: Union[ctypes.c_size_t, int], + maxlen: ctypes.c_size_t | int, split_path: bytes, - split_no: Union[ctypes.c_int, int], - split_count: Union[ctypes.c_int, int], + split_no: ctypes.c_int | int, + split_count: ctypes.c_int | int, /, ) -> int: """Extract the path prefix from the split_path if and only if the split_no and split_count match.""" @@ -3528,7 +3495,7 @@ def llama_print_system_info() -> bytes: None, ) def llama_log_set( - log_callback: Optional[CtypesFuncPointer], + log_callback: CtypesFuncPointer | None, user_data: ctypes.c_void_p, /, ): diff --git a/llama_cpp/llama_tokenizer.py b/llama_cpp/llama_tokenizer.py index ca31cabd2..20cba796d 100644 --- a/llama_cpp/llama_tokenizer.py +++ b/llama_cpp/llama_tokenizer.py @@ -26,7 +26,7 @@ def tokenize( @abc.abstractmethod def detokenize( - self, tokens: List[int], prev_tokens: Optional[List[int]] = None, + self, tokens: List[int], prev_tokens: List[int] | None = None, ) -> bytes: """Detokenize the tokens into text. @@ -47,7 +47,7 @@ def tokenize( return self._model.tokenize(text, add_bos=add_bos, special=special) def detokenize( - self, tokens: List[int], prev_tokens: Optional[List[int]] = None, + self, tokens: List[int], prev_tokens: List[int] | None = None, ) -> bytes: return self._model.detokenize(tokens) @@ -78,7 +78,7 @@ def tokenize( ) def detokenize( - self, tokens: List[int], prev_tokens: Optional[List[int]] = None, + self, tokens: List[int], prev_tokens: List[int] | None = None, ) -> bytes: if prev_tokens is not None: text = self.hf_tokenizer.decode(prev_tokens + tokens).encode( diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 84bde6ef3..9c9969b59 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -7,14 +7,14 @@ """ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Literal, NotRequired, Optional, Union -from typing_extensions import Literal, NotRequired, TypedDict +from typing_extensions import TypedDict # NOTE: Defining this correctly using annotations seems to break pydantic validation. 
# This is a workaround until we can figure out how to do this correctly # JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]] -JsonType = Union[None, int, str, bool, List[Any], Dict[str, Any]] +JsonType = Union[None, int, str, bool, list[Any], dict[str, Any]] class EmbeddingUsage(TypedDict): @@ -25,28 +25,28 @@ class EmbeddingUsage(TypedDict): class Embedding(TypedDict): index: int object: str - embedding: Union[List[float], List[List[float]]] + embedding: list[float] | list[list[float]] class CreateEmbeddingResponse(TypedDict): object: Literal["list"] model: str - data: List[Embedding] + data: list[Embedding] usage: EmbeddingUsage class CompletionLogprobs(TypedDict): - text_offset: List[int] - token_logprobs: List[Optional[float]] - tokens: List[str] - top_logprobs: List[Optional[Dict[str, float]]] + text_offset: list[int] + token_logprobs: list[float | None] + tokens: list[str] + top_logprobs: list[dict[str, float] | None] class CompletionChoice(TypedDict): text: str index: int - logprobs: Optional[CompletionLogprobs] - finish_reason: Optional[Literal["stop", "length"]] + logprobs: CompletionLogprobs | None + finish_reason: Literal["stop", "length"] | None class CompletionUsage(TypedDict): @@ -60,7 +60,7 @@ class CreateCompletionResponse(TypedDict): object: Literal["text_completion"] created: int model: str - choices: List[CompletionChoice] + choices: list[CompletionChoice] usage: NotRequired[CompletionUsage] @@ -70,7 +70,7 @@ class ChatCompletionResponseFunctionCall(TypedDict): class ChatCompletionResponseMessage(TypedDict): - content: Optional[str] + content: str | None tool_calls: NotRequired["ChatCompletionMessageToolCalls"] role: Literal["assistant", "function"] # NOTE: "function" may be incorrect here function_call: NotRequired[ChatCompletionResponseFunctionCall] # DEPRECATED @@ -79,14 +79,14 @@ class ChatCompletionResponseMessage(TypedDict): class ChatCompletionFunction(TypedDict): name: str description: NotRequired[str] - parameters: Dict[str, JsonType] # TODO: make this more specific + parameters: dict[str, JsonType] # TODO: make this more specific class ChatCompletionResponseChoice(TypedDict): index: int message: "ChatCompletionResponseMessage" - logprobs: Optional[CompletionLogprobs] - finish_reason: Optional[str] + logprobs: CompletionLogprobs | None + finish_reason: str | None class CreateChatCompletionResponse(TypedDict): @@ -94,12 +94,12 @@ class CreateChatCompletionResponse(TypedDict): object: Literal["chat.completion"] created: int model: str - choices: List["ChatCompletionResponseChoice"] + choices: list["ChatCompletionResponseChoice"] usage: CompletionUsage class ChatCompletionMessageToolCallChunkFunction(TypedDict): - name: Optional[str] + name: str | None arguments: str @@ -120,21 +120,19 @@ class ChatCompletionStreamResponseDeltaFunctionCall(TypedDict): class ChatCompletionStreamResponseDelta(TypedDict): - content: NotRequired[Optional[str]] + content: NotRequired[str | None] function_call: NotRequired[ - Optional[ChatCompletionStreamResponseDeltaFunctionCall] + ChatCompletionStreamResponseDeltaFunctionCall | None ] # DEPRECATED - tool_calls: NotRequired[Optional[List[ChatCompletionMessageToolCallChunk]]] - role: NotRequired[Optional[Literal["system", "user", "assistant", "tool"]]] + tool_calls: NotRequired[list[ChatCompletionMessageToolCallChunk] | None] + role: NotRequired[Literal["system", "user", "assistant", "tool"] | None] class ChatCompletionStreamResponseChoice(TypedDict): index: int - delta: Union[ - 
ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty, - ] - finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]] - logprobs: NotRequired[Optional[CompletionLogprobs]] + delta: ChatCompletionStreamResponseDelta | ChatCompletionStreamResponseDeltaEmpty + finish_reason: Literal["stop", "length", "tool_calls", "function_call"] | None + logprobs: NotRequired[CompletionLogprobs | None] class CreateChatCompletionStreamResponse(TypedDict): @@ -142,13 +140,13 @@ class CreateChatCompletionStreamResponse(TypedDict): model: str object: Literal["chat.completion.chunk"] created: int - choices: List[ChatCompletionStreamResponseChoice] + choices: list[ChatCompletionStreamResponseChoice] class ChatCompletionFunctions(TypedDict): name: str description: NotRequired[str] - parameters: Dict[str, JsonType] # TODO: make this more specific + parameters: dict[str, JsonType] # TODO: make this more specific class ChatCompletionFunctionCallOption(TypedDict): @@ -174,7 +172,7 @@ class ChatCompletionRequestMessageContentPartImageImageUrl(TypedDict): class ChatCompletionRequestMessageContentPartImage(TypedDict): type: Literal["image_url"] - image_url: Union[str, ChatCompletionRequestMessageContentPartImageImageUrl] + image_url: str | ChatCompletionRequestMessageContentPartImageImageUrl ChatCompletionRequestMessageContentPart = Union[ @@ -185,12 +183,12 @@ class ChatCompletionRequestMessageContentPartImage(TypedDict): class ChatCompletionRequestSystemMessage(TypedDict): role: Literal["system"] - content: Optional[str] + content: str | None class ChatCompletionRequestUserMessage(TypedDict): role: Literal["user"] - content: Optional[Union[str, List[ChatCompletionRequestMessageContentPart]]] + content: str | list[ChatCompletionRequestMessageContentPart] | None class ChatCompletionMessageToolCallFunction(TypedDict): @@ -204,7 +202,7 @@ class ChatCompletionMessageToolCall(TypedDict): function: ChatCompletionMessageToolCallFunction -ChatCompletionMessageToolCalls = List[ChatCompletionMessageToolCall] +ChatCompletionMessageToolCalls = list[ChatCompletionMessageToolCall] class ChatCompletionRequestAssistantMessageFunctionCall(TypedDict): @@ -214,7 +212,7 @@ class ChatCompletionRequestAssistantMessageFunctionCall(TypedDict): class ChatCompletionRequestAssistantMessage(TypedDict): role: Literal["assistant"] - content: Optional[str] + content: str | None tool_calls: NotRequired[ChatCompletionMessageToolCalls] function_call: NotRequired[ ChatCompletionRequestAssistantMessageFunctionCall @@ -223,13 +221,13 @@ class ChatCompletionRequestAssistantMessage(TypedDict): class ChatCompletionRequestToolMessage(TypedDict): role: Literal["tool"] - content: Optional[str] + content: str | None tool_call_id: str class ChatCompletionRequestFunctionMessage(TypedDict): role: Literal["function"] - content: Optional[str] + content: str | None name: str @@ -251,7 +249,7 @@ class ChatCompletionRequestFunctionCallOption(TypedDict): Literal["none", "auto"], ChatCompletionRequestFunctionCallOption, ] -ChatCompletionFunctionParameters = Dict[str, JsonType] # TODO: make this more specific +ChatCompletionFunctionParameters = dict[str, JsonType] # TODO: make this more specific class ChatCompletionToolFunction(TypedDict): diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py index cf09005d6..cc1a3210a 100644 --- a/llama_cpp/llava_cpp.py +++ b/llama_cpp/llava_cpp.py @@ -5,6 +5,7 @@ import os import pathlib import sys +from collections.abc import Callable from ctypes import ( POINTER, Structure, @@ -19,17 
+20,15 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Generic, List, NewType, Optional, + TypeAlias, TypeVar, Union, ) -from typing_extensions import TypeAlias - from llama_cpp import llama_cpp @@ -39,7 +38,7 @@ def _load_shared_library(lib_base_name: str): _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" # Searching for the library in the current directory under the name "libllama" (default name # for llamacpp) and "llama" (default name for this repo) - _lib_paths: List[pathlib.Path] = [] + _lib_paths: list[pathlib.Path] = [] # Determine the file extension based on the platform if sys.platform.startswith("linux"): _lib_paths += [ @@ -106,9 +105,7 @@ def _load_shared_library(lib_base_name: str): class CtypesRef(Generic[CtypesCData]): pass - CtypesPointerOrRef: TypeAlias = Union[ - CtypesPointer[CtypesCData], CtypesRef[CtypesCData], - ] + CtypesPointerOrRef: TypeAlias = CtypesPointer[CtypesCData] | CtypesRef[CtypesCData] CtypesFuncPointer: TypeAlias = ctypes._FuncPointer # type: ignore @@ -117,7 +114,7 @@ class CtypesRef(Generic[CtypesCData]): def ctypes_function_for_shared_library(lib: ctypes.CDLL): def ctypes_function( - name: str, argtypes: List[Any], restype: Any, enabled: bool = True, + name: str, argtypes: list[Any], restype: Any, enabled: bool = True, ): def decorator(f: F) -> F: if enabled: @@ -178,9 +175,9 @@ def llava_validate_embed_size( ) def llava_image_embed_make_with_bytes( ctx_clip: clip_ctx_p, - n_threads: Union[c_int, int], + n_threads: c_int | int, image_bytes: CtypesArray[c_uint8], - image_bytes_length: Union[c_int, int], + image_bytes_length: c_int | int, /, ) -> _Pointer[llava_image_embed]: ... @@ -193,7 +190,7 @@ def llava_image_embed_make_with_bytes( POINTER(llava_image_embed), ) def llava_image_embed_make_with_filename( - ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, /, + ctx_clip: clip_ctx_p, n_threads: c_int | int, image_path: bytes, /, ) -> _Pointer[llava_image_embed]: ... @@ -218,7 +215,7 @@ def llava_image_embed_free(embed: _Pointer[llava_image_embed], /): ... def llava_eval_image_embed( ctx_llama: llama_cpp.llama_context_p, embed: _Pointer[llava_image_embed], - n_batch: Union[c_int, int], + n_batch: c_int | int, n_past: _Pointer[c_int], /, ) -> bool: ... @@ -233,8 +230,8 @@ def llava_eval_image_embed( # CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); @ctypes_function("clip_model_load", [c_char_p, c_int], clip_ctx_p_ctypes) def clip_model_load( - fname: bytes, verbosity: Union[c_int, int], /, -) -> Optional[clip_ctx_p]: ... + fname: bytes, verbosity: c_int | int, /, +) -> clip_ctx_p | None: ... 
# /** free mmproj model */ diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 040ebca7b..6afb0ec8b 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -4,9 +4,10 @@ import json import os import typing +from collections.abc import Iterator from functools import partial from threading import Lock -from typing import Dict, Iterator, List, Optional, Union +from typing import Dict, List, Optional, Union import anyio from anyio.streams.memory import MemoryObjectSendStream @@ -44,7 +45,7 @@ router = APIRouter(route_class=RouteErrorHandler) -_server_settings: Optional[ServerSettings] = None +_server_settings: ServerSettings | None = None def set_server_settings(server_settings: ServerSettings): @@ -56,13 +57,13 @@ def get_server_settings(): yield _server_settings -_llama_proxy: Optional[LlamaProxy] = None +_llama_proxy: LlamaProxy | None = None llama_outer_lock = Lock() llama_inner_lock = Lock() -def set_llama_proxy(model_settings: List[ModelSettings]): +def set_llama_proxy(model_settings: list[ModelSettings]): global _llama_proxy _llama_proxy = LlamaProxy(models=model_settings) @@ -86,7 +87,7 @@ def get_llama_proxy(): llama_outer_lock.release() -_ping_message_factory: typing.Optional[typing.Callable[[], bytes]] = None +_ping_message_factory: typing.Callable[[], bytes] | None = None def set_ping_message_factory(factory: typing.Callable[[], bytes]): @@ -97,7 +98,7 @@ def set_ping_message_factory(factory: typing.Callable[[], bytes]): def create_app( settings: Settings | None = None, server_settings: ServerSettings | None = None, - model_settings: List[ModelSettings] | None = None, + model_settings: list[ModelSettings] | None = None, ): config_file = os.environ.get("CONFIG_FILE", None) if config_file is not None: @@ -156,7 +157,7 @@ async def get_event_publisher( request: Request, inner_send_chan: MemoryObjectSendStream[typing.Any], iterator: Iterator[typing.Any], - on_complete: typing.Optional[typing.Callable[[], None]] = None, + on_complete: typing.Callable[[], None] | None = None, ): server_settings = next(get_server_settings()) interrupt_requests = ( @@ -184,9 +185,9 @@ async def get_event_publisher( def _logit_bias_tokens_to_input_ids( llama: llama_cpp.Llama, - logit_bias: Dict[str, float], -) -> Dict[str, float]: - to_bias: Dict[str, float] = {} + logit_bias: dict[str, float], +) -> dict[str, float]: + to_bias: dict[str, float] = {} for token, score in logit_bias.items(): token = token.encode("utf-8") for input_id in llama.tokenize(token, add_bos=False, special=True): @@ -200,7 +201,7 @@ def _logit_bias_tokens_to_input_ids( async def authenticate( settings: Settings = Depends(get_server_settings), - authorization: Optional[str] = Depends(bearer_scheme), + authorization: str | None = Depends(bearer_scheme), ): # Skip API key check if it's not set in settings if settings.api_key is None: @@ -311,10 +312,7 @@ async def create_completion( else: kwargs["logits_processor"].extend(_min_tokens_logits_processor) - iterator_or_completion: Union[ - llama_cpp.CreateCompletionResponse, - Iterator[llama_cpp.CreateCompletionStreamResponse], - ] = await run_in_threadpool(llama, **kwargs) + iterator_or_completion: llama_cpp.CreateCompletionResponse | Iterator[llama_cpp.CreateCompletionStreamResponse] = await run_in_threadpool(llama, **kwargs) if isinstance(iterator_or_completion, Iterator): # EAFP: It's easier to ask for forgiveness than permission @@ -504,9 +502,7 @@ async def create_chat_completion( else: kwargs["logits_processor"].extend(_min_tokens_logits_processor) - 
iterator_or_completion: Union[ - llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk], - ] = await run_in_threadpool(llama.create_chat_completion, **kwargs) + iterator_or_completion: llama_cpp.ChatCompletion | Iterator[llama_cpp.ChatCompletionChunk] = await run_in_threadpool(llama.create_chat_completion, **kwargs) if isinstance(iterator_or_completion, Iterator): # EAFP: It's easier to ask for forgiveness than permission diff --git a/llama_cpp/server/cli.py b/llama_cpp/server/cli.py index c8bb46fa6..763ef73dc 100644 --- a/llama_cpp/server/cli.py +++ b/llama_cpp/server/cli.py @@ -6,35 +6,34 @@ from pydantic import BaseModel -def _get_base_type(annotation: Type[Any]) -> Type[Any]: +def _get_base_type(annotation: type[Any]) -> type[Any]: if getattr(annotation, "__origin__", None) is Literal: assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore return type(annotation.__args__[0]) # type: ignore - elif getattr(annotation, "__origin__", None) is Union: + if getattr(annotation, "__origin__", None) is Union: assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore - non_optional_args: List[Type[Any]] = [ + non_optional_args: list[type[Any]] = [ arg for arg in annotation.__args__ if arg is not type(None) # type: ignore ] if non_optional_args: return _get_base_type(non_optional_args[0]) elif ( getattr(annotation, "__origin__", None) is list - or getattr(annotation, "__origin__", None) is List + or getattr(annotation, "__origin__", None) is list ): assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore return _get_base_type(annotation.__args__[0]) # type: ignore return annotation -def _contains_list_type(annotation: Type[Any] | None) -> bool: +def _contains_list_type(annotation: type[Any] | None) -> bool: origin = getattr(annotation, "__origin__", None) - if origin is list or origin is List: + if origin is list or origin is list: return True - elif origin in (Literal, Union): + if origin in (Literal, Union): return any(_contains_list_type(arg) for arg in annotation.__args__) # type: ignore - else: - return False + return False def _parse_bool_arg(arg: str | bytes | bool) -> bool: @@ -48,13 +47,12 @@ def _parse_bool_arg(arg: str | bytes | bool) -> bool: if arg_str in true_values: return True - elif arg_str in false_values: + if arg_str in false_values: return False - else: - raise ValueError(f"Invalid boolean argument: {arg}") + raise ValueError(f"Invalid boolean argument: {arg}") -def add_args_from_model(parser: argparse.ArgumentParser, model: Type[BaseModel]): +def add_args_from_model(parser: argparse.ArgumentParser, model: type[BaseModel]): """Add arguments from a pydantic model to an argparse parser.""" for name, field in model.model_fields.items(): @@ -82,7 +80,7 @@ def add_args_from_model(parser: argparse.ArgumentParser, model: Type[BaseModel]) ) -T = TypeVar("T", bound=Type[BaseModel]) +T = TypeVar("T", bound=type[BaseModel]) def parse_model_from_args(model: T, args: argparse.Namespace) -> T: diff --git a/llama_cpp/server/errors.py b/llama_cpp/server/errors.py index 03926e292..f50154c81 100644 --- a/llama_cpp/server/errors.py +++ b/llama_cpp/server/errors.py @@ -3,8 +3,9 @@ import sys import time import traceback +from collections.abc import Callable, Coroutine from re import Match, Pattern, compile -from typing import Callable, Coroutine, Dict, Optional, Tuple, Union +from typing import Dict, Optional, Tuple, Union from fastapi import ( HTTPException, @@ -27,8 +28,8 @@ class 
ErrorResponse(TypedDict): message: str type: str - param: Optional[str] - code: Optional[str] + param: str | None + code: str | None class ErrorResponseFormatters: @@ -45,9 +46,9 @@ class ErrorResponseFormatters: @staticmethod def context_length_exceeded( - request: Union[CreateCompletionRequest, CreateChatCompletionRequest], + request: CreateCompletionRequest | CreateChatCompletionRequest, match, # type: Match[str] # type: ignore - ) -> Tuple[int, ErrorResponse]: + ) -> tuple[int, ErrorResponse]: """Formatter for context length exceeded error""" context_window = int(match.group(2)) @@ -83,9 +84,9 @@ def context_length_exceeded( @staticmethod def model_not_found( - request: Union[CreateCompletionRequest, CreateChatCompletionRequest], + request: CreateCompletionRequest | CreateChatCompletionRequest, match, # type: Match[str] # type: ignore - ) -> Tuple[int, ErrorResponse]: + ) -> tuple[int, ErrorResponse]: """Formatter for model_not_found error""" model_path = str(match.group(1)) @@ -103,14 +104,14 @@ class RouteErrorHandler(APIRoute): # key: regex pattern for original error message from llama_cpp # value: formatter function - pattern_and_formatters: Dict[ + pattern_and_formatters: dict[ Pattern[str], Callable[ [ - Union[CreateCompletionRequest, CreateChatCompletionRequest], + CreateCompletionRequest | CreateChatCompletionRequest, Match[str], ], - Tuple[int, ErrorResponse], + tuple[int, ErrorResponse], ], ] = { compile( @@ -124,14 +125,8 @@ class RouteErrorHandler(APIRoute): def error_message_wrapper( self, error: Exception, - body: Optional[ - Union[ - CreateChatCompletionRequest, - CreateCompletionRequest, - CreateEmbeddingRequest, - ] - ] = None, - ) -> Tuple[int, ErrorResponse]: + body: CreateChatCompletionRequest | CreateCompletionRequest | CreateEmbeddingRequest | None = None, + ) -> tuple[int, ErrorResponse]: """Wraps error message in OpenAI style error response""" print(f"Exception: {error!s}", file=sys.stderr) traceback.print_exc(file=sys.stderr) @@ -179,13 +174,7 @@ async def custom_route_handler(request: Request) -> Response: try: if "messages" in json_body: # Chat completion - body: Optional[ - Union[ - CreateChatCompletionRequest, - CreateCompletionRequest, - CreateEmbeddingRequest, - ] - ] = CreateChatCompletionRequest(**json_body) + body: CreateChatCompletionRequest | CreateCompletionRequest | CreateEmbeddingRequest | None = CreateChatCompletionRequest(**json_body) elif "prompt" in json_body: # Text completion body = CreateCompletionRequest(**json_body) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 4ce33814f..747343016 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -9,7 +9,7 @@ class LlamaProxy: - def __init__(self, models: List[ModelSettings]) -> None: + def __init__(self, models: list[ModelSettings]) -> None: assert len(models) > 0, "No models provided!" 
self._model_settings_dict: dict[str, ModelSettings] = {} @@ -18,8 +18,8 @@ def __init__(self, models: List[ModelSettings]) -> None: model.model_alias = model.model self._model_settings_dict[model.model_alias] = model - self._current_model: Optional[llama_cpp.Llama] = None - self._current_model_alias: Optional[str] = None + self._current_model: llama_cpp.Llama | None = None + self._current_model_alias: str | None = None self._default_model_settings: ModelSettings = models[0] self._default_model_alias: str = self._default_model_settings.model_alias # type: ignore @@ -30,7 +30,7 @@ def __init__(self, models: List[ModelSettings]) -> None: ) self._current_model_alias = self._default_model_alias - def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: + def __call__(self, model: str | None = None) -> llama_cpp.Llama: if model is None: model = self._default_model_alias @@ -53,7 +53,7 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: def __getitem__(self, model: str): return self._model_settings_dict[model].model_dump() - def __setitem__(self, model: str, settings: Union[ModelSettings, str, bytes]): + def __setitem__(self, model: str, settings: ModelSettings | str | bytes): if isinstance(settings, (bytes, str)): settings = ModelSettings.model_validate_json(settings) self._model_settings_dict[model] = settings @@ -171,7 +171,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: json.load(open(settings.hf_tokenizer_config_path)), ) - tokenizer: Optional[llama_cpp.BaseLlamaTokenizer] = None + tokenizer: llama_cpp.BaseLlamaTokenizer | None = None if settings.hf_pretrained_model_name_or_path is not None: tokenizer = llama_tokenizer.LlamaHFTokenizer.from_pretrained( settings.hf_pretrained_model_name_or_path, @@ -183,7 +183,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: num_pred_tokens=settings.draft_model_num_pred_tokens, ) - kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None + kv_overrides: dict[str, bool | int | float | str] | None = None if settings.kv_overrides is not None: assert isinstance(settings.kv_overrides, list) kv_overrides = {} diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 848b93869..5f5747265 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -1,11 +1,10 @@ from __future__ import annotations import multiprocessing -from typing import Dict, List, Literal, Optional, Union, cast +from typing import Dict, List, Literal, Optional, Self, Union, cast from pydantic import Field, model_validator from pydantic_settings import BaseSettings -from typing_extensions import Self import llama_cpp @@ -19,7 +18,7 @@ class ModelSettings(BaseSettings): model: str = Field( description="The path to the model to use for generating completions.", ) - model_alias: Optional[str] = Field( + model_alias: str | None = Field( default=None, description="The alias of the model to use for generating completions.", ) @@ -38,7 +37,7 @@ class ModelSettings(BaseSettings): ge=0, description="Main GPU to use.", ) - tensor_split: Optional[List[float]] = Field( + tensor_split: list[float] | None = Field( default=None, description="Split layers across multiple GPUs in proportion.", ) @@ -53,11 +52,11 @@ class ModelSettings(BaseSettings): default=llama_cpp.llama_supports_mlock(), description="Use mlock.", ) - kv_overrides: Optional[List[str]] = Field( + kv_overrides: list[str] | None = Field( default=None, description="List of model kv overrides in 
the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.", ) - rpc_servers: Optional[str] = Field( + rpc_servers: str | None = Field( default=None, description="comma seperated list of rpc servers for offloading", ) @@ -109,25 +108,25 @@ class ModelSettings(BaseSettings): description="Last n tokens to keep for repeat penalty calculation.", ) # LoRA Params - lora_base: Optional[str] = Field( + lora_base: str | None = Field( default=None, description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.", ) - lora_path: Optional[str] = Field( + lora_path: str | None = Field( default=None, description="Path to a LoRA file to apply to the model.", ) # Backend Params - numa: Union[bool, int] = Field( + numa: bool | int = Field( default=False, description="Enable NUMA support.", ) # Chat Format Params - chat_format: Optional[str] = Field( + chat_format: str | None = Field( default=None, description="Chat format to use.", ) - clip_model_path: Optional[str] = Field( + clip_model_path: str | None = Field( default=None, description="Path to a CLIP model to use for multi-modal chat completion.", ) @@ -145,21 +144,21 @@ class ModelSettings(BaseSettings): description="The size of the cache in bytes. Only used if cache is True.", ) # Tokenizer Options - hf_tokenizer_config_path: Optional[str] = Field( + hf_tokenizer_config_path: str | None = Field( default=None, description="The path to a HuggingFace tokenizer_config.json file.", ) - hf_pretrained_model_name_or_path: Optional[str] = Field( + hf_pretrained_model_name_or_path: str | None = Field( default=None, description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().", ) # Loading from HuggingFace Model Hub - hf_model_repo_id: Optional[str] = Field( + hf_model_repo_id: str | None = Field( default=None, description="The model repo id to use for the HuggingFace tokenizer model.", ) # Speculative Decoding - draft_model: Optional[str] = Field( + draft_model: str | None = Field( default=None, description="Method to use for speculative decoding. 
One of (prompt-lookup-decoding).", ) @@ -168,11 +167,11 @@ class ModelSettings(BaseSettings): description="Number of tokens to predict using the draft model.", ) # KV Cache Quantization - type_k: Optional[int] = Field( + type_k: int | None = Field( default=None, description="Type of the key cache quantization.", ) - type_v: Optional[int] = Field( + type_v: int | None = Field( default=None, description="Type of the value cache quantization.", ) @@ -187,7 +186,7 @@ class ModelSettings(BaseSettings): def set_dynamic_defaults(self) -> Self: # If n_threads or n_threads_batch is -1, set it to multiprocessing.cpu_count() cpu_count = multiprocessing.cpu_count() - values = cast(Dict[str, int], self) + values = cast(dict[str, int], self) if values.get("n_threads", 0) == -1: values["n_threads"] = cpu_count if values.get("n_threads_batch", 0) == -1: @@ -201,14 +200,14 @@ class ServerSettings(BaseSettings): # Uvicorn Settings host: str = Field(default="localhost", description="Listen address") port: int = Field(default=8000, description="Listen port") - ssl_keyfile: Optional[str] = Field( + ssl_keyfile: str | None = Field( default=None, description="SSL key file for HTTPS", ) - ssl_certfile: Optional[str] = Field( + ssl_certfile: str | None = Field( default=None, description="SSL certificate file for HTTPS", ) # FastAPI Settings - api_key: Optional[str] = Field( + api_key: str | None = Field( default=None, description="API key for authentication. If set all requests need to be authenticated.", ) @@ -233,4 +232,4 @@ class Settings(ServerSettings, ModelSettings): class ConfigFileSettings(ServerSettings): """Configuration file format settings.""" - models: List[ModelSettings] = Field(default=[], description="Model configs") + models: list[ModelSettings] = Field(default=[], description="Model configs") diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index 4b4298d06..5943c6613 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -1,9 +1,9 @@ from __future__ import annotations -from typing import Dict, List, Optional, Union +from typing import Dict, List, Literal, Optional, Union from pydantic import BaseModel, Field -from typing_extensions import Literal, TypedDict +from typing_extensions import TypedDict import llama_cpp @@ -106,14 +106,14 @@ class CreateCompletionRequest(BaseModel): - prompt: Union[str, List[str]] = Field( + prompt: str | list[str] = Field( default="", description="The prompt to generate completions for.", ) - suffix: Optional[str] = Field( + suffix: str | None = Field( default=None, description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.", ) - max_tokens: Optional[int] = Field( + max_tokens: int | None = Field( default=16, ge=0, description="The maximum number of tokens to generate.", ) min_tokens: int = min_tokens_field @@ -124,32 +124,32 @@ class CreateCompletionRequest(BaseModel): default=False, description="Whether to echo the prompt in the generated text. Useful for chatbots.", ) - stop: Optional[Union[str, List[str]]] = stop_field + stop: str | list[str] | None = stop_field stream: bool = stream_field - logprobs: Optional[int] = Field( + logprobs: int | None = Field( default=None, ge=0, description="The number of logprobs to generate. 
If None, no logprobs are generated.", ) - presence_penalty: Optional[float] = presence_penalty_field - frequency_penalty: Optional[float] = frequency_penalty_field - logit_bias: Optional[Dict[str, float]] = Field(None) - seed: Optional[int] = Field(None) + presence_penalty: float | None = presence_penalty_field + frequency_penalty: float | None = frequency_penalty_field + logit_bias: dict[str, float] | None = Field(None) + seed: int | None = Field(None) # ignored or currently unsupported - model: Optional[str] = model_field - n: Optional[int] = 1 - best_of: Optional[int] = 1 - user: Optional[str] = Field(default=None) + model: str | None = model_field + n: int | None = 1 + best_of: int | None = 1 + user: str | None = Field(default=None) # llama.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field - logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) + logit_bias_type: Literal["input_ids", "tokens"] | None = Field(None) mirostat_mode: int = mirostat_mode_field mirostat_tau: float = mirostat_tau_field mirostat_eta: float = mirostat_eta_field - grammar: Optional[str] = None + grammar: str | None = None model_config = { "json_schema_extra": { @@ -164,9 +164,9 @@ class CreateCompletionRequest(BaseModel): class CreateEmbeddingRequest(BaseModel): - model: Optional[str] = model_field - input: Union[str, List[str]] = Field(description="The input to embed.") - user: Optional[str] = Field(default=None) + model: str | None = model_field + input: str | list[str] = Field(description="The input to embed.") + user: str | None = Field(default=None) model_config = { "json_schema_extra": { @@ -183,41 +183,41 @@ class ChatCompletionRequestMessage(BaseModel): role: Literal["system", "user", "assistant", "function"] = Field( default="user", description="The role of the message.", ) - content: Optional[str] = Field( + content: str | None = Field( default="", description="The content of the message.", ) class CreateChatCompletionRequest(BaseModel): - messages: List[llama_cpp.ChatCompletionRequestMessage] = Field( + messages: list[llama_cpp.ChatCompletionRequestMessage] = Field( default=[], description="A list of messages to generate completions for.", ) - functions: Optional[List[llama_cpp.ChatCompletionFunction]] = Field( + functions: list[llama_cpp.ChatCompletionFunction] | None = Field( default=None, description="A list of functions to apply to the generated completions.", ) - function_call: Optional[llama_cpp.ChatCompletionRequestFunctionCall] = Field( + function_call: llama_cpp.ChatCompletionRequestFunctionCall | None = Field( default=None, description="A function to apply to the generated completions.", ) - tools: Optional[List[llama_cpp.ChatCompletionTool]] = Field( + tools: list[llama_cpp.ChatCompletionTool] | None = Field( default=None, description="A list of tools to apply to the generated completions.", ) - tool_choice: Optional[llama_cpp.ChatCompletionToolChoiceOption] = Field( + tool_choice: llama_cpp.ChatCompletionToolChoiceOption | None = Field( default=None, description="A tool to apply to the generated completions.", ) # TODO: verify - max_tokens: Optional[int] = Field( + max_tokens: int | None = Field( default=None, description="The maximum number of tokens to generate. Defaults to inf", ) min_tokens: int = min_tokens_field - logprobs: Optional[bool] = Field( + logprobs: bool | None = Field( default=False, description="Whether to output the logprobs or not. 
Default is True", ) - top_logprobs: Optional[int] = Field( + top_logprobs: int | None = Field( default=None, ge=0, description="The number of logprobs to generate. If None, no logprobs are generated. logprobs need to set to True.", @@ -225,29 +225,29 @@ class CreateChatCompletionRequest(BaseModel): temperature: float = temperature_field top_p: float = top_p_field min_p: float = min_p_field - stop: Optional[Union[str, List[str]]] = stop_field + stop: str | list[str] | None = stop_field stream: bool = stream_field - presence_penalty: Optional[float] = presence_penalty_field - frequency_penalty: Optional[float] = frequency_penalty_field - logit_bias: Optional[Dict[str, float]] = Field(None) - seed: Optional[int] = Field(None) - response_format: Optional[llama_cpp.ChatCompletionRequestResponseFormat] = Field( + presence_penalty: float | None = presence_penalty_field + frequency_penalty: float | None = frequency_penalty_field + logit_bias: dict[str, float] | None = Field(None) + seed: int | None = Field(None) + response_format: llama_cpp.ChatCompletionRequestResponseFormat | None = Field( default=None, ) # ignored or currently unsupported - model: Optional[str] = model_field - n: Optional[int] = 1 - user: Optional[str] = Field(None) + model: str | None = model_field + n: int | None = 1 + user: str | None = Field(None) # llama.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field - logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) + logit_bias_type: Literal["input_ids", "tokens"] | None = Field(None) mirostat_mode: int = mirostat_mode_field mirostat_tau: float = mirostat_tau_field mirostat_eta: float = mirostat_eta_field - grammar: Optional[str] = None + grammar: str | None = None model_config = { "json_schema_extra": { @@ -271,16 +271,16 @@ class ModelData(TypedDict): id: str object: Literal["model"] owned_by: str - permissions: List[str] + permissions: list[str] class ModelList(TypedDict): object: Literal["list"] - data: List[ModelData] + data: list[ModelData] class TokenizeInputRequest(BaseModel): - model: Optional[str] = model_field + model: str | None = model_field input: str = Field(description="The input to tokenize.") model_config = { @@ -289,7 +289,7 @@ class TokenizeInputRequest(BaseModel): class TokenizeInputResponse(BaseModel): - tokens: List[int] = Field(description="A list of tokens.") + tokens: list[int] = Field(description="A list of tokens.") model_config = {"json_schema_extra": {"example": {"tokens": [123, 321, 222]}}} @@ -301,8 +301,8 @@ class TokenizeInputCountResponse(BaseModel): class DetokenizeInputRequest(BaseModel): - model: Optional[str] = model_field - tokens: List[int] = Field(description="A list of toekns to detokenize.") + model: str | None = model_field + tokens: list[int] = Field(description="A list of toekns to detokenize.") model_config = {"json_schema_extra": {"example": [{"tokens": [123, 321, 222]}]}} diff --git a/tests/test_llama.py b/tests/test_llama.py index 469ef91ca..abbedb0ab 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -101,7 +101,6 @@ def mock_get_logits(ctx: llama_cpp.llama_context_p): def mock_kv_cache_clear(ctx: llama_cpp.llama_context_p): # Test some basic invariants of this mocking technique assert ctx == llama._ctx.ctx, "context does not match mock_llama" - return def mock_kv_cache_seq_rm( ctx: llama_cpp.llama_context_p, @@ -111,7 +110,6 @@ def mock_kv_cache_seq_rm( ): # Test some basic invariants of this mocking technique assert ctx == llama._ctx.ctx, "context does 
not match mock_llama" - return def mock_kv_cache_seq_cp( ctx: llama_cpp.llama_context_p, @@ -122,15 +120,13 @@ def mock_kv_cache_seq_cp( ): # Test some basic invariants of this mocking technique assert ctx == llama._ctx.ctx, "context does not match mock_llama" - return - + def mock_kv_cache_seq_keep( ctx: llama_cpp.llama_context_p, seq_id: llama_cpp.llama_seq_id, ): # Test some basic invariants of this mocking technique assert ctx == llama._ctx.ctx, "context does not match mock_llama" - return def mock_kv_cache_seq_add( ctx: llama_cpp.llama_context_p, @@ -140,7 +136,6 @@ def mock_kv_cache_seq_add( ): # Test some basic invariants of this mocking technique assert ctx == llama._ctx.ctx, "context does not match mock_llama" - return monkeypatch.setattr("llama_cpp.llama_cpp.llama_kv_cache_clear", mock_kv_cache_clear) monkeypatch.setattr("llama_cpp.llama_cpp.llama_kv_cache_seq_rm", mock_kv_cache_seq_rm) @@ -244,7 +239,8 @@ def test_utf8(mock_llama): def test_llama_server(): from fastapi.testclient import TestClient - from llama_cpp.server.app import create_app, Settings + + from llama_cpp.server.app import Settings, create_app settings = Settings( model=MODEL, diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py index f031bf72b..0379ba4d1 100644 --- a/tests/test_llama_chat_format.py +++ b/tests/test_llama_chat_format.py @@ -4,12 +4,12 @@ from llama_cpp import ( ChatCompletionRequestUserMessage, + llama_chat_format, + llama_types, ) -import llama_cpp.llama_types as llama_types -import llama_cpp.llama_chat_format as llama_chat_format - from llama_cpp.llama_chat_format import hf_tokenizer_config_to_chat_formatter + def test_mistral_instruct(): chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" chat_formatter = jinja2.Template(chat_template) diff --git a/tests/test_llama_grammar.py b/tests/test_llama_grammar.py index cb221880a..979400119 100644 --- a/tests/test_llama_grammar.py +++ b/tests/test_llama_grammar.py @@ -1,6 +1,7 @@ -import llama_cpp import json +import llama_cpp + tree = """ leaf ::= "." 
node ::= leaf | "(" node node ")" @@ -75,4 +76,4 @@ def test_grammar_anyof(): grammar = llama_cpp.LlamaGrammar.from_json_schema(json.dumps(sch)) - assert grammar.grammar is not None \ No newline at end of file + assert grammar.grammar is not None diff --git a/tests/test_llama_speculative.py b/tests/test_llama_speculative.py index b5d450567..63e8adbb3 100644 --- a/tests/test_llama_speculative.py +++ b/tests/test_llama_speculative.py @@ -2,6 +2,7 @@ from llama_cpp.llama_speculative import LlamaPromptLookupDecoding + def test_find_candidate_pred_tokens(): find_candidate_pred_tokens = LlamaPromptLookupDecoding.find_candidate_pred_tokens From d8d9c4d6781e5600ac66b85b07630898de42bdd5 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 07:26:17 +0200 Subject: [PATCH 146/177] Lint E711 --- examples/low_level_api/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py index 7ec141f2a..5c3214921 100644 --- a/examples/low_level_api/common.py +++ b/examples/low_level_api/common.py @@ -377,7 +377,7 @@ def gpt_params_parse(argv=None): if params.lora_adapter: params.use_mmap = False - if logit_bias_str != None: + if logit_bias_str is not None: for i in logit_bias_str: if m := re.match(r"(\d+)([-+]\d+)", i): params.logit_bias[int(m.group(1))] = float(m.group(2)) From f747d463e096cc0889f0f63071a81fa87af4f6bc Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 07:33:45 +0200 Subject: [PATCH 147/177] Update llama_types.py --- llama_cpp/llama_types.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 9c9969b59..7d5a17bcb 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -7,9 +7,9 @@ """ -from typing import Any, Dict, List, Literal, NotRequired, Optional, Union +from typing import Any, Dict, List, Optional, Union -from typing_extensions import TypedDict +from typing_extensions import Literal, NotRequired, TypedDict # NOTE: Defining this correctly using annotations seems to break pydantic validation. # This is a workaround until we can figure out how to do this correctly From 91ce8acd124cbec0859c43eed9fd7a36c921d18d Mon Sep 17 00:00:00 2001 From: Smartappli Date: Thu, 15 Aug 2024 05:33:59 +0000 Subject: [PATCH 148/177] style fixes by ruff --- llama_cpp/llama_types.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 7d5a17bcb..9c9969b59 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -7,9 +7,9 @@ """ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Literal, NotRequired, Optional, Union -from typing_extensions import Literal, NotRequired, TypedDict +from typing_extensions import TypedDict # NOTE: Defining this correctly using annotations seems to break pydantic validation. 
# This is a workaround until we can figure out how to do this correctly From e33d41bf42c9d1934477138f12f82def7276c84a Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 07:37:38 +0200 Subject: [PATCH 149/177] Delete .github/workflows/fixer.yml --- .github/workflows/fixer.yml | 34 ---------------------------------- 1 file changed, 34 deletions(-) delete mode 100644 .github/workflows/fixer.yml diff --git a/.github/workflows/fixer.yml b/.github/workflows/fixer.yml deleted file mode 100644 index 3c451f05a..000000000 --- a/.github/workflows/fixer.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: Fixer - -on: [push, pull_request] - -concurrency: - group: fixer-${{ github.event_name == 'pull_request' && format('{0}-{1}', github.workflow, github.event.pull_request.number) || github.workflow_ref }} - cancel-in-progress: true - -jobs: - ruff-lint: - name: Ruff - runs-on: ubuntu-latest - permissions: - # Give the default GITHUB_TOKEN write permission to commit and push the - # added or changed files to the repository. - contents: write - - steps: - - uses: actions/checkout@v4 - with: - ref: ${{ github.head_ref }} - - - uses: actions/setup-python@v5 - with: - python-version: '3.12' - cache: 'pip' - - - uses: chartboost/ruff-action@v1 - with: - args: 'check --preview --fix-only' - - - uses: stefanzweifel/git-auto-commit-action@v5 - with: - commit_message: 'style fixes by ruff' From 03f80077d5c8d02eaf1147ea7b05481ebfbca3e6 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 07:41:22 +0200 Subject: [PATCH 150/177] Update settings.py --- llama_cpp/server/settings.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 5f5747265..03df095d7 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -1,7 +1,8 @@ from __future__ import annotations import multiprocessing -from typing import Dict, List, Literal, Optional, Self, Union, cast +from typing import Dict, List, Literal, Optional, Union, cast +from typing_extensions import Self from pydantic import Field, model_validator from pydantic_settings import BaseSettings From 85b1930c8a8b6139bfb4272b1d3e6396f81067c3 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 07:44:44 +0200 Subject: [PATCH 151/177] Update llama_types.py --- llama_cpp/llama_types.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 9c9969b59..b40739289 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -7,9 +7,9 @@ """ -from typing import Any, Dict, List, Literal, NotRequired, Optional, Union +from typing import Any, Dict, List, Optional, Union -from typing_extensions import TypedDict +from typing_extensions import NotRequired, Literal, TypedDict # NOTE: Defining this correctly using annotations seems to break pydantic validation. 
# This is a workaround until we can figure out how to do this correctly From a9a75581bb5ecea8a9ff54d32b1edb675bd06573 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 07:48:02 +0200 Subject: [PATCH 152/177] Update llama_cpp.py --- llama_cpp/llama_cpp.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 40d91874e..0ead46aa1 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -13,11 +13,10 @@ List, NewType, Optional, - TypeAlias, TypeVar, Union, ) - +from typing_extensions import TypeAlias # Load the library def _load_shared_library(lib_base_name: str): From b58c8b5da2a83ac969ee00ebb3899ec33efa2a19 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 07:52:58 +0200 Subject: [PATCH 153/177] Update ruff.toml --- ruff.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ruff.toml b/ruff.toml index de9934d7e..672658e2e 100644 --- a/ruff.toml +++ b/ruff.toml @@ -33,8 +33,8 @@ exclude = [ line-length = 120 indent-width = 4 -# Assume Python 3.12 -target-version = "py312" +# Assume Python 3.9 +target-version = "py39" [lint] preview = true From 683868b213cb3d954e0212717d7e9293a4660ab1 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 07:54:41 +0200 Subject: [PATCH 154/177] Update llama_types.py --- llama_cpp/llama_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index b40739289..98585c5af 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -25,7 +25,7 @@ class EmbeddingUsage(TypedDict): class Embedding(TypedDict): index: int object: str - embedding: list[float] | list[list[float]] + embedding: Union[List[float], List[List[float]]] class CreateEmbeddingResponse(TypedDict): From a5f16dcb3f1d7061e38040450ac0231eca05c00d Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 08:00:50 +0200 Subject: [PATCH 155/177] Update llama_types.py --- llama_cpp/llama_types.py | 68 +++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 98585c5af..84bde6ef3 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -9,12 +9,12 @@ from typing import Any, Dict, List, Optional, Union -from typing_extensions import NotRequired, Literal, TypedDict +from typing_extensions import Literal, NotRequired, TypedDict # NOTE: Defining this correctly using annotations seems to break pydantic validation. 
# This is a workaround until we can figure out how to do this correctly # JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]] -JsonType = Union[None, int, str, bool, list[Any], dict[str, Any]] +JsonType = Union[None, int, str, bool, List[Any], Dict[str, Any]] class EmbeddingUsage(TypedDict): @@ -31,22 +31,22 @@ class Embedding(TypedDict): class CreateEmbeddingResponse(TypedDict): object: Literal["list"] model: str - data: list[Embedding] + data: List[Embedding] usage: EmbeddingUsage class CompletionLogprobs(TypedDict): - text_offset: list[int] - token_logprobs: list[float | None] - tokens: list[str] - top_logprobs: list[dict[str, float] | None] + text_offset: List[int] + token_logprobs: List[Optional[float]] + tokens: List[str] + top_logprobs: List[Optional[Dict[str, float]]] class CompletionChoice(TypedDict): text: str index: int - logprobs: CompletionLogprobs | None - finish_reason: Literal["stop", "length"] | None + logprobs: Optional[CompletionLogprobs] + finish_reason: Optional[Literal["stop", "length"]] class CompletionUsage(TypedDict): @@ -60,7 +60,7 @@ class CreateCompletionResponse(TypedDict): object: Literal["text_completion"] created: int model: str - choices: list[CompletionChoice] + choices: List[CompletionChoice] usage: NotRequired[CompletionUsage] @@ -70,7 +70,7 @@ class ChatCompletionResponseFunctionCall(TypedDict): class ChatCompletionResponseMessage(TypedDict): - content: str | None + content: Optional[str] tool_calls: NotRequired["ChatCompletionMessageToolCalls"] role: Literal["assistant", "function"] # NOTE: "function" may be incorrect here function_call: NotRequired[ChatCompletionResponseFunctionCall] # DEPRECATED @@ -79,14 +79,14 @@ class ChatCompletionResponseMessage(TypedDict): class ChatCompletionFunction(TypedDict): name: str description: NotRequired[str] - parameters: dict[str, JsonType] # TODO: make this more specific + parameters: Dict[str, JsonType] # TODO: make this more specific class ChatCompletionResponseChoice(TypedDict): index: int message: "ChatCompletionResponseMessage" - logprobs: CompletionLogprobs | None - finish_reason: str | None + logprobs: Optional[CompletionLogprobs] + finish_reason: Optional[str] class CreateChatCompletionResponse(TypedDict): @@ -94,12 +94,12 @@ class CreateChatCompletionResponse(TypedDict): object: Literal["chat.completion"] created: int model: str - choices: list["ChatCompletionResponseChoice"] + choices: List["ChatCompletionResponseChoice"] usage: CompletionUsage class ChatCompletionMessageToolCallChunkFunction(TypedDict): - name: str | None + name: Optional[str] arguments: str @@ -120,19 +120,21 @@ class ChatCompletionStreamResponseDeltaFunctionCall(TypedDict): class ChatCompletionStreamResponseDelta(TypedDict): - content: NotRequired[str | None] + content: NotRequired[Optional[str]] function_call: NotRequired[ - ChatCompletionStreamResponseDeltaFunctionCall | None + Optional[ChatCompletionStreamResponseDeltaFunctionCall] ] # DEPRECATED - tool_calls: NotRequired[list[ChatCompletionMessageToolCallChunk] | None] - role: NotRequired[Literal["system", "user", "assistant", "tool"] | None] + tool_calls: NotRequired[Optional[List[ChatCompletionMessageToolCallChunk]]] + role: NotRequired[Optional[Literal["system", "user", "assistant", "tool"]]] class ChatCompletionStreamResponseChoice(TypedDict): index: int - delta: ChatCompletionStreamResponseDelta | ChatCompletionStreamResponseDeltaEmpty - finish_reason: Literal["stop", "length", "tool_calls", "function_call"] | None - logprobs: 
NotRequired[CompletionLogprobs | None] + delta: Union[ + ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty, + ] + finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]] + logprobs: NotRequired[Optional[CompletionLogprobs]] class CreateChatCompletionStreamResponse(TypedDict): @@ -140,13 +142,13 @@ class CreateChatCompletionStreamResponse(TypedDict): model: str object: Literal["chat.completion.chunk"] created: int - choices: list[ChatCompletionStreamResponseChoice] + choices: List[ChatCompletionStreamResponseChoice] class ChatCompletionFunctions(TypedDict): name: str description: NotRequired[str] - parameters: dict[str, JsonType] # TODO: make this more specific + parameters: Dict[str, JsonType] # TODO: make this more specific class ChatCompletionFunctionCallOption(TypedDict): @@ -172,7 +174,7 @@ class ChatCompletionRequestMessageContentPartImageImageUrl(TypedDict): class ChatCompletionRequestMessageContentPartImage(TypedDict): type: Literal["image_url"] - image_url: str | ChatCompletionRequestMessageContentPartImageImageUrl + image_url: Union[str, ChatCompletionRequestMessageContentPartImageImageUrl] ChatCompletionRequestMessageContentPart = Union[ @@ -183,12 +185,12 @@ class ChatCompletionRequestMessageContentPartImage(TypedDict): class ChatCompletionRequestSystemMessage(TypedDict): role: Literal["system"] - content: str | None + content: Optional[str] class ChatCompletionRequestUserMessage(TypedDict): role: Literal["user"] - content: str | list[ChatCompletionRequestMessageContentPart] | None + content: Optional[Union[str, List[ChatCompletionRequestMessageContentPart]]] class ChatCompletionMessageToolCallFunction(TypedDict): @@ -202,7 +204,7 @@ class ChatCompletionMessageToolCall(TypedDict): function: ChatCompletionMessageToolCallFunction -ChatCompletionMessageToolCalls = list[ChatCompletionMessageToolCall] +ChatCompletionMessageToolCalls = List[ChatCompletionMessageToolCall] class ChatCompletionRequestAssistantMessageFunctionCall(TypedDict): @@ -212,7 +214,7 @@ class ChatCompletionRequestAssistantMessageFunctionCall(TypedDict): class ChatCompletionRequestAssistantMessage(TypedDict): role: Literal["assistant"] - content: str | None + content: Optional[str] tool_calls: NotRequired[ChatCompletionMessageToolCalls] function_call: NotRequired[ ChatCompletionRequestAssistantMessageFunctionCall @@ -221,13 +223,13 @@ class ChatCompletionRequestAssistantMessage(TypedDict): class ChatCompletionRequestToolMessage(TypedDict): role: Literal["tool"] - content: str | None + content: Optional[str] tool_call_id: str class ChatCompletionRequestFunctionMessage(TypedDict): role: Literal["function"] - content: str | None + content: Optional[str] name: str @@ -249,7 +251,7 @@ class ChatCompletionRequestFunctionCallOption(TypedDict): Literal["none", "auto"], ChatCompletionRequestFunctionCallOption, ] -ChatCompletionFunctionParameters = dict[str, JsonType] # TODO: make this more specific +ChatCompletionFunctionParameters = Dict[str, JsonType] # TODO: make this more specific class ChatCompletionToolFunction(TypedDict): From a35e3f2f4715db40f6d3d99ffeb9c8fee4927320 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 08:04:38 +0200 Subject: [PATCH 156/177] Update llama_cache.py --- llama_cpp/llama_cache.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama_cache.py b/llama_cpp/llama_cache.py index 7bf4084db..8a9f18f37 100644 --- a/llama_cpp/llama_cache.py +++ b/llama_cpp/llama_cache.py @@ -1,9 +1,9 
@@ import sys from abc import ABC, abstractmethod from collections import OrderedDict -from collections.abc import Sequence from typing import ( Optional, + Sequence, Tuple, ) @@ -27,8 +27,8 @@ def cache_size(self) -> int: def _find_longest_prefix_key( self, - key: tuple[int, ...], - ) -> tuple[int, ...] | None: + key: Tuple[int, ...], + ) -> Optional[Tuple[int, ...]]: pass @abstractmethod @@ -52,7 +52,7 @@ class LlamaRAMCache(BaseLlamaCache): def __init__(self, capacity_bytes: int = (2 << 30)): super().__init__(capacity_bytes) self.capacity_bytes = capacity_bytes - self.cache_state: OrderedDict[tuple[int, ...], llama_cpp.llama.LlamaState] = ( + self.cache_state: OrderedDict[Tuple[int, ...], llama_cpp.llama.LlamaState] = ( OrderedDict() ) @@ -62,8 +62,8 @@ def cache_size(self): def _find_longest_prefix_key( self, - key: tuple[int, ...], - ) -> tuple[int, ...] | None: + key: Tuple[int, ...], + ) -> Optional[Tuple[int, ...]]: min_len = 0 min_key = None keys = ( @@ -116,10 +116,10 @@ def cache_size(self): def _find_longest_prefix_key( self, - key: tuple[int, ...], - ) -> tuple[int, ...] | None: + key: Tuple[int, ...], + ) -> Optional[Tuple[int, ...]]: min_len = 0 - min_key: tuple[int, ...] | None = None + min_key: Optional[Tuple[int, ...]] = None for k in self.cache.iterkeys(): # type: ignore prefix_len = llama_cpp.llama.Llama.longest_token_prefix(k, key) if prefix_len > min_len: From ca3fc20e7decf14c4cf52569cd05d73474c2dac0 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 08:10:05 +0200 Subject: [PATCH 157/177] Update llama_cpp.py --- llama_cpp/llama_cpp.py | 452 ++++++++++++++++++++++++++--------------- 1 file changed, 292 insertions(+), 160 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 0ead46aa1..0d30d6be9 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -5,10 +5,10 @@ import os import pathlib import sys -from collections.abc import Callable from typing import ( TYPE_CHECKING, Any, + Callable, Generic, List, NewType, @@ -16,15 +16,17 @@ TypeVar, Union, ) + from typing_extensions import TypeAlias + # Load the library def _load_shared_library(lib_base_name: str): # Construct the paths to the possible shared library names _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" # Searching for the library in the current directory under the name "libllama" (default name # for llamacpp) and "llama" (default name for this repo) - _lib_paths: list[pathlib.Path] = [] + _lib_paths: List[pathlib.Path] = [] # Determine the file extension based on the platform if sys.platform.startswith("linux") or sys.platform.startswith("freebsd"): _lib_paths += [ @@ -75,7 +77,7 @@ def _load_shared_library(lib_base_name: str): raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") raise FileNotFoundError( - f"Shared library with base name '{lib_base_name}' not found", + f"Shared library with base name '{lib_base_name}' not found" ) @@ -106,7 +108,9 @@ def _load_shared_library(lib_base_name: str): class CtypesRef(Generic[CtypesCData]): pass - CtypesPointerOrRef: TypeAlias = CtypesPointer[CtypesCData] | CtypesRef[CtypesCData] + CtypesPointerOrRef: TypeAlias = Union[ + CtypesPointer[CtypesCData], CtypesRef[CtypesCData] + ] CtypesFuncPointer: TypeAlias = ctypes._FuncPointer # type: ignore @@ -115,7 +119,7 @@ class CtypesRef(Generic[CtypesCData]): def ctypes_function_for_shared_library(lib: ctypes.CDLL): def ctypes_function( - name: str, argtypes: list[Any], restype: Any, enabled: bool = True, + name: str, 
argtypes: List[Any], restype: Any, enabled: bool = True ): def decorator(f: F) -> F: if enabled: @@ -124,8 +128,8 @@ def decorator(f: F) -> F: func.restype = restype functools.wraps(f)(func) return func - - return f + else: + return f return decorator @@ -135,8 +139,9 @@ def decorator(f: F) -> F: ctypes_function = ctypes_function_for_shared_library(_lib) -def byref(obj: CtypesCData, offset: int | None = None) -> CtypesRef[CtypesCData]: +def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]: """Type-annotated version of ctypes.byref""" + ... byref = ctypes.byref # type: ignore @@ -209,7 +214,7 @@ def byref(obj: CtypesCData, offset: int | None = None) -> CtypesRef[CtypesCData] # from ggml-backend.h # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); ggml_backend_sched_eval_callback = ctypes.CFUNCTYPE( - ctypes.c_bool, ctypes.c_void_p, ctypes.c_bool, ctypes.c_void_p, + ctypes.c_bool, ctypes.c_void_p, ctypes.c_bool, ctypes.c_void_p ) # // Abort callback @@ -571,7 +576,7 @@ class llama_token_data_array(ctypes.Structure): # typedef bool (*llama_progress_callback)(float progress, void * user_data); llama_progress_callback = ctypes.CFUNCTYPE( - ctypes.c_bool, ctypes.c_float, ctypes.c_void_p, + ctypes.c_bool, ctypes.c_float, ctypes.c_void_p ) @@ -694,7 +699,7 @@ class llama_model_kv_override(ctypes.Structure): if TYPE_CHECKING: tag: int key: bytes - value: int | float | bool | bytes + value: Union[int, float, bool, bytes] # struct llama_model_params { @@ -925,7 +930,7 @@ class llama_context_params(ctypes.Structure): # // It might not exist for progress report where '.' is output repeatedly. # typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data); llama_log_callback = ctypes.CFUNCTYPE( - None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p, + None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p ) """Signature for logging events Note that text includes the new line character at the end for most events. @@ -939,7 +944,7 @@ class llama_context_params(ctypes.Structure): # int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() # enum llama_ftype ftype; // quantize to this llama_ftype # enum ggml_type output_tensor_type; // output tensor type -# enum ggml_type token_embedding_type; // token embeddings tensor type +# enum ggml_type token_embedding_type; // itoken embeddings tensor type # bool allow_requantize; // allow quantizing non-f32/f16 tensors # bool quantize_output_tensor; // quantize output.weight # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored @@ -955,7 +960,7 @@ class llama_model_quantize_params(ctypes.Structure): nthread (int): number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() ftype (int): quantize to this llama_ftype output_tensor_type (int): output tensor type - token_embedding_type (int): token embeddings tensor type + token_embedding_type (int): itoken embeddings tensor type allow_requantize (bool): allow quantizing non-f32/f16 tensors quantize_output_tensor (bool): quantize output.weight only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored @@ -1118,6 +1123,7 @@ class llama_chat_message(ctypes.Structure): ) def llama_model_default_params() -> llama_model_params: """Get default parameters for llama_model""" + ... 
# LLAMA_API struct llama_context_params llama_context_default_params(void); @@ -1128,6 +1134,7 @@ def llama_model_default_params() -> llama_model_params: ) def llama_context_default_params() -> llama_context_params: """Get default parameters for llama_context""" + ... # LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void); @@ -1138,6 +1145,7 @@ def llama_context_default_params() -> llama_context_params: ) def llama_model_quantize_default_params() -> llama_model_quantize_params: """Get default parameters for llama_model_quantize""" + ... # // Initialize the llama + ggml backend @@ -1154,6 +1162,7 @@ def llama_backend_init(): """Initialize the llama + ggml backend If numa is true, use NUMA optimizations Call once at the start of the program""" + ... # // numa strategies @@ -1193,6 +1202,7 @@ def llama_numa_init(numa: int, /): ) def llama_backend_free(): """Call once at the end of the program - currently only used for MPI""" + ... # LLAMA_API struct llama_model * llama_load_model_from_file( @@ -1204,8 +1214,8 @@ def llama_backend_free(): llama_model_p_ctypes, ) def llama_load_model_from_file( - path_model: bytes, params: llama_model_params, /, -) -> llama_model_p | None: + path_model: bytes, params: llama_model_params, / +) -> Optional[llama_model_p]: ... @@ -1228,8 +1238,8 @@ def llama_free_model(model: llama_model_p, /): llama_context_p_ctypes, ) def llama_new_context_with_model( - model: llama_model_p, params: llama_context_params, /, -) -> llama_context_p | None: + model: llama_model_p, params: llama_context_params, / +) -> Optional[llama_context_p]: ... @@ -1242,6 +1252,7 @@ def llama_new_context_with_model( ) def llama_free(ctx: llama_context_p, /): """Frees all allocated memory""" + ... # LLAMA_API int64_t llama_time_us(void); @@ -1280,7 +1291,7 @@ def llama_supports_gpu_offload() -> bool: # LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); @ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes) -def llama_get_model(ctx: llama_context_p, /) -> llama_model_p | None: +def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ... @@ -1355,6 +1366,7 @@ def llama_n_layer(model: llama_model_p, /) -> int: @ctypes_function("llama_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float) def llama_rope_freq_scale_train(model: llama_model_p, /) -> float: """Get the model's RoPE frequency scaling factor""" + ... # // Functions to access the model's GGUF metadata scalar values @@ -1377,12 +1389,13 @@ def llama_rope_freq_scale_train(model: llama_model_p, /) -> float: ) def llama_model_meta_val_str( model: llama_model_p, - key: ctypes.c_char_p | bytes, + key: Union[ctypes.c_char_p, bytes], buf: bytes, buf_size: int, /, ) -> int: """Get metadata value as a string by key name""" + ... # // Get the number of metadata key/value pairs @@ -1390,6 +1403,7 @@ def llama_model_meta_val_str( @ctypes_function("llama_model_meta_count", [llama_model_p_ctypes], ctypes.c_int32) def llama_model_meta_count(model: llama_model_p, /) -> int: """Get the number of metadata key/value pairs""" + ... # // Get metadata key name by index @@ -1406,12 +1420,13 @@ def llama_model_meta_count(model: llama_model_p, /) -> int: ) def llama_model_meta_key_by_index( model: llama_model_p, - i: ctypes.c_int | int, - buf: bytes | CtypesArray[ctypes.c_char], + i: Union[ctypes.c_int, int], + buf: Union[bytes, CtypesArray[ctypes.c_char]], buf_size: int, /, ) -> int: """Get metadata key name by index""" + ... 
# // Get metadata value as a string by index @@ -1428,12 +1443,13 @@ def llama_model_meta_key_by_index( ) def llama_model_meta_val_str_by_index( model: llama_model_p, - i: ctypes.c_int | int, - buf: bytes | CtypesArray[ctypes.c_char], + i: Union[ctypes.c_int, int], + buf: Union[bytes, CtypesArray[ctypes.c_char]], buf_size: int, /, ) -> int: """Get metadata value as a string by index""" + ... # // Get a string describing the model type @@ -1445,11 +1461,12 @@ def llama_model_meta_val_str_by_index( ) def llama_model_desc( model: llama_model_p, - buf: bytes | CtypesArray[ctypes.c_char], - buf_size: ctypes.c_size_t | int, + buf: Union[bytes, CtypesArray[ctypes.c_char]], + buf_size: Union[ctypes.c_size_t, int], /, ) -> int: """Get a string describing the model type""" + ... # // Returns the total size of all the tensors in the model in bytes @@ -1457,6 +1474,7 @@ def llama_model_desc( @ctypes_function("llama_model_size", [llama_model_p_ctypes], ctypes.c_uint64) def llama_model_size(model: llama_model_p, /) -> int: """Returns the total size of all the tensors in the model in bytes""" + ... # // Returns the total number of parameters in the model @@ -1464,17 +1482,19 @@ def llama_model_size(model: llama_model_p, /) -> int: @ctypes_function("llama_model_n_params", [llama_model_p_ctypes], ctypes.c_uint64) def llama_model_n_params(model: llama_model_p, /) -> int: """Returns the total number of parameters in the model""" + ... # // Get a llama model tensor # LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name); @ctypes_function( - "llama_get_model_tensor", [llama_model_p_ctypes, ctypes.c_char_p], ctypes.c_void_p, + "llama_get_model_tensor", [llama_model_p_ctypes, ctypes.c_char_p], ctypes.c_void_p ) def llama_get_model_tensor( - model: llama_model_p, name: ctypes.c_char_p | bytes, /, + model: llama_model_p, name: Union[ctypes.c_char_p, bytes], / ) -> ctypes.c_void_p: """Get a llama model tensor""" + ... # // Returns true if the model contains an encoder that requires llama_encode() call @@ -1482,25 +1502,20 @@ def llama_get_model_tensor( @ctypes_function("llama_model_has_encoder", [llama_model_p_ctypes], ctypes.c_bool) def llama_model_has_encoder(model: llama_model_p, /) -> bool: """Returns true if the model contains an encoder that requires llama_encode() call""" - - -# // Returns true if the model contains a decoder that requires llama_decode() call -# LLAMA_API bool llama_model_has_decoder(const struct llama_model * model); -@ctypes_function("llama_model_has_decoder", [llama_model_p_ctypes], ctypes.c_bool) -def llama_model_has_decoder(model: llama_model_p, /) -> bool: - """Returns true if the model contains a decoder that requires llama_decode() call""" + ... # // For encoder-decoder models, this function returns id of the token that must be provided # // to the decoder to start generating output sequence. For other models, it returns -1. # LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model); @ctypes_function( - "llama_model_decoder_start_token", [llama_model_p_ctypes], ctypes.c_int32, + "llama_model_decoder_start_token", [llama_model_p_ctypes], ctypes.c_int32 ) def llama_model_decoder_start_token(model: llama_model_p, /) -> int: """For encoder-decoder models, this function returns id of the token that must be provided to the decoder to start generating output sequence. For other models, it returns -1. """ + ... 
# // Returns 0 on success @@ -1524,6 +1539,7 @@ def llama_model_quantize( /, ) -> int: """Returns 0 on success""" + ... # // Load a LoRA adapter from file @@ -1537,11 +1553,12 @@ def llama_model_quantize( llama_lora_adapter_p_ctypes, ) def llama_lora_adapter_init( - model: llama_model_p, path_lora: bytes, /, -) -> llama_lora_adapter_p | None: + model: llama_model_p, path_lora: bytes, / +) -> Optional[llama_lora_adapter_p]: """Load a LoRA adapter from file The loaded adapter will be associated to the given model, and will be free when the model is deleted """ + ... # // Add a loaded LoRA adapter to given context @@ -1556,10 +1573,11 @@ def llama_lora_adapter_init( ctypes.c_int32, ) def llama_lora_adapter_set( - ctx: llama_context_p, adapter: llama_lora_adapter_p, scale: float, /, + ctx: llama_context_p, adapter: llama_lora_adapter_p, scale: float, / ) -> int: """Add a loaded LoRA adapter to given context This will not modify model's weight""" + ... # // Remove a specific LoRA adapter from given context @@ -1573,10 +1591,11 @@ def llama_lora_adapter_set( ctypes.c_int32, ) def llama_lora_adapter_remove( - ctx: llama_context_p, adapter: llama_lora_adapter_p, /, + ctx: llama_context_p, adapter: llama_lora_adapter_p, / ) -> int: """Remove a LoRA adapter from given context Return -1 if the adapter is not present in the context""" + ... # // Remove all LoRA adapters from given context @@ -1589,6 +1608,7 @@ def llama_lora_adapter_remove( ) def llama_lora_adapter_clear(ctx: llama_context_p, /): """Remove all LoRA adapters from given context""" + ... # // Manually free a LoRA adapter @@ -1602,6 +1622,7 @@ def llama_lora_adapter_clear(ctx: llama_context_p, /): def llama_lora_adapter_free(adapter: llama_lora_adapter_p, /): """Manually free a LoRA adapter Note: loaded adapters will be free when the associated model is deleted""" + ... # // Apply a loaded control vector to a llama_context, or if data is NULL, clear @@ -1644,6 +1665,7 @@ def llama_control_vector_apply( to an n_embd x n_layers buffer starting from layer 1. il_start and il_end are the layer range the vector should apply to (both inclusive) See llama_control_vector_load in common to load a control vector.""" + ... # // @@ -1736,9 +1758,10 @@ class llama_kv_cache_view(ctypes.Structure): llama_kv_cache_view, ) def llama_kv_cache_view_init( - ctx: llama_context_p, n_seq_max: ctypes.c_int32 | int, /, + ctx: llama_context_p, n_seq_max: Union[ctypes.c_int32, int], / ) -> llama_kv_cache_view: """Create an empty KV cache view. (use only for debugging purposes)""" + ... # // Free a KV cache view. (use only for debugging purposes) @@ -1746,36 +1769,40 @@ def llama_kv_cache_view_init( @ctypes_function("llama_kv_cache_view_free", [llama_kv_cache_view_p], None) def llama_kv_cache_view_free(view: ctypes.pointer[llama_kv_cache_view], /): # type: ignore """Free a KV cache view. (use only for debugging purposes)""" + ... # // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes) # LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view); @ctypes_function( - "llama_kv_cache_view_update", [llama_context_p_ctypes, llama_kv_cache_view_p], None, + "llama_kv_cache_view_update", [llama_context_p_ctypes, llama_kv_cache_view_p], None ) def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[llama_kv_cache_view], /): # type: ignore """Update the KV cache view structure with the current state of the KV cache. 
(use only for debugging purposes)""" + ... # // Returns the number of tokens in the KV cache (slow, use only for debug) # // If a KV cell has multiple sequences assigned to it, it will be counted multiple times # LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx); @ctypes_function( - "llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32, + "llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32 ) def llama_get_kv_cache_token_count(ctx: llama_context_p, /) -> int: """Returns the number of tokens in the KV cache (slow, use only for debug) If a KV cell has multiple sequences assigned to it, it will be counted multiple times """ + ... # // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) # LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx); @ctypes_function( - "llama_get_kv_cache_used_cells", [llama_context_p_ctypes], ctypes.c_int32, + "llama_get_kv_cache_used_cells", [llama_context_p_ctypes], ctypes.c_int32 ) def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int: """Returns the number of used KV cells (i.e. have at least one sequence assigned to them)""" + ... # // Clear the KV cache - both cell info is erased and KV data is zeroed @@ -1784,6 +1811,7 @@ def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int: @ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None) def llama_kv_cache_clear(ctx: llama_context_p, /): """Clear the KV cache""" + ... # // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) @@ -1808,9 +1836,9 @@ def llama_kv_cache_clear(ctx: llama_context_p, /): ) def llama_kv_cache_seq_rm( ctx: llama_context_p, - seq_id: llama_seq_id | int, - p0: llama_pos | int, - p1: llama_pos | int, + seq_id: Union[llama_seq_id, int], + p0: Union[llama_pos, int], + p1: Union[llama_pos, int], /, ) -> bool: """Removes all tokens that belong to the specified sequence and have positions in [p0, p1) @@ -1820,6 +1848,7 @@ def llama_kv_cache_seq_rm( seq_id < 0 : match any sequence p0 < 0 : [0, p1] p1 < 0 : [p0, inf)""" + ... # // Copy all tokens that belong to the specified sequence to another sequence @@ -1845,16 +1874,17 @@ def llama_kv_cache_seq_rm( ) def llama_kv_cache_seq_cp( ctx: llama_context_p, - seq_id_src: llama_seq_id | int, - seq_id_dst: llama_seq_id | int, - p0: llama_pos | int, - p1: llama_pos | int, + seq_id_src: Union[llama_seq_id, int], + seq_id_dst: Union[llama_seq_id, int], + p0: Union[llama_pos, int], + p1: Union[llama_pos, int], /, ): """Copy all tokens that belong to the specified sequence to another sequence Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence p0 < 0 : [0, p1] p1 < 0 : [p0, inf)""" + ... # // Removes all tokens that do not belong to the specified sequence @@ -1862,10 +1892,11 @@ def llama_kv_cache_seq_cp( # struct llama_context * ctx, # llama_seq_id seq_id); @ctypes_function( - "llama_kv_cache_seq_keep", [llama_context_p_ctypes, llama_seq_id], None, + "llama_kv_cache_seq_keep", [llama_context_p_ctypes, llama_seq_id], None ) -def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: llama_seq_id | int, /): +def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): """Removes all tokens that do not belong to the specified sequence""" + ... 
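# Sketch of the sequence-level KV-cache operations above, assuming `ctx` is a
# live llama_context that has already decoded some tokens on sequence 0; the
# position arguments follow the [p0, p1) convention, with negative values
# meaning "open ended" as described in the docstrings.
import llama_cpp

def rewind_and_fork(ctx, n_keep: int) -> None:
    # Drop everything in sequence 0 from position n_keep onward (p1 < 0 -> [p0, inf)).
    llama_cpp.llama_kv_cache_seq_rm(ctx, 0, n_keep, -1)
    # Fork the kept prefix into sequence 1; per the docstring this does not
    # allocate extra KV memory, it only re-assigns the cells.
    llama_cpp.llama_kv_cache_seq_cp(ctx, 0, 1, 0, -1)
    # Keep only sequence 1, discarding every other sequence.
    llama_cpp.llama_kv_cache_seq_keep(ctx, 1)
    print("tokens in cache:", llama_cpp.llama_get_kv_cache_token_count(ctx))
    print("used cells     :", llama_cpp.llama_get_kv_cache_used_cells(ctx))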
# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) @@ -1893,10 +1924,10 @@ def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: llama_seq_id | int, /) ) def llama_kv_cache_seq_add( ctx: llama_context_p, - seq_id: llama_seq_id | int, - p0: llama_pos | int, - p1: llama_pos | int, - delta: llama_pos | int, + seq_id: Union[llama_seq_id, int], + p0: Union[llama_pos, int], + p1: Union[llama_pos, int], + delta: Union[llama_pos, int], /, ): """Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) @@ -1905,6 +1936,7 @@ def llama_kv_cache_seq_add( - explicitly with llama_kv_cache_update() p0 < 0 : [0, p1] p1 < 0 : [p0, inf)""" + ... # // Integer division of the positions by factor of `d > 1` @@ -1930,16 +1962,17 @@ def llama_kv_cache_seq_add( ) def llama_kv_cache_seq_div( ctx: llama_context_p, - seq_id: llama_seq_id | int, - p0: llama_pos | int, - p1: llama_pos | int, - d: ctypes.c_int | int, + seq_id: Union[llama_seq_id, int], + p0: Union[llama_pos, int], + p1: Union[llama_pos, int], + d: Union[ctypes.c_int, int], /, ): """Integer division of the positions by factor of `d > 1` If the KV cache is RoPEd, the KV data is updated accordingly p0 < 0 : [0, p1] p1 < 0 : [p0, inf)""" + ... # // Defragment the KV cache @@ -1953,6 +1986,7 @@ def llama_kv_cache_defrag(ctx: llama_context_p, /): This will be applied: - lazily on next llama_decode() - explicitly with llama_kv_cache_update()""" + ... # // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) @@ -1960,6 +1994,7 @@ def llama_kv_cache_defrag(ctx: llama_context_p, /): @ctypes_function("llama_kv_cache_update", [llama_context_p_ctypes], None) def llama_kv_cache_update(ctx: llama_context_p, /): """Apply the KV cache updates (such as K-shifts, defragmentation, etc.)""" + ... # // @@ -1974,6 +2009,7 @@ def llama_kv_cache_update(ctx: llama_context_p, /): @ctypes_function("llama_state_get_size", [llama_context_p_ctypes], ctypes.c_size_t) def llama_state_get_size(ctx: llama_context_p, /) -> int: """Returns the *actual* size in bytes of the state (rng, logits, embedding and kv_cache) - will often be smaller after compacting tokens""" + ... # LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx), @@ -1982,6 +2018,7 @@ def llama_state_get_size(ctx: llama_context_p, /) -> int: def llama_get_state_size(ctx: llama_context_p, /) -> int: """Returns the maximum size in bytes of the state (rng, logits, embedding and kv_cache) - will often be smaller after compacting tokens""" + ... # // Copies the state to the specified destination address. @@ -2003,12 +2040,13 @@ def llama_get_state_size(ctx: llama_context_p, /) -> int: def llama_state_get_data( ctx: llama_context_p, dst: CtypesArray[ctypes.c_uint8], - size: ctypes.c_size_t | int, + size: Union[ctypes.c_size_t, int], /, ) -> int: """Copies the state to the specified destination address. Destination needs to have allocated enough memory. Returns the number of bytes copied""" + ... # LLAMA_API DEPRECATED(size_t llama_copy_state_data( @@ -2024,11 +2062,12 @@ def llama_state_get_data( ctypes.c_size_t, ) def llama_copy_state_data( - ctx: llama_context_p, dst: CtypesArray[ctypes.c_uint8], /, + ctx: llama_context_p, dst: CtypesArray[ctypes.c_uint8], / ) -> int: """Copies the state to the specified destination address. Destination needs to have allocated enough memory. Returns the number of bytes copied""" + ... 
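# Sketch combining the lazy position-shift and the whole-context snapshot
# calls above; `ctx` is again an assumed live llama_context. The docstrings
# note that shifts and defragmentation are applied lazily, so
# llama_kv_cache_update() is invoked explicitly here to force them through
# before the state is copied out.
import ctypes
import llama_cpp

def shift_and_snapshot(ctx, discard: int) -> bytes:
    # Slide the cached positions of sequence 0 from `discard` onward back by
    # `discard` (the usual context-shifting pattern for long generations).
    llama_cpp.llama_kv_cache_seq_add(ctx, 0, discard, -1, -discard)
    llama_cpp.llama_kv_cache_defrag(ctx)
    llama_cpp.llama_kv_cache_update(ctx)

    # Copy the full state (rng, logits, embedding, kv_cache) into a buffer.
    size = llama_cpp.llama_state_get_size(ctx)
    buf = (ctypes.c_uint8 * size)()
    written = llama_cpp.llama_state_get_data(ctx, buf, size)
    return bytes(buf[:written])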
# // Set the state reading from the specified address @@ -2045,11 +2084,12 @@ def llama_copy_state_data( def llama_state_set_data( ctx: llama_context_p, src: CtypesArray[ctypes.c_uint8], - size: ctypes.c_size_t | int, + size: Union[ctypes.c_size_t, int], /, ) -> int: """Set the state reading from the specified address Returns the number of bytes read""" + ... # LLAMA_API DEPRECATED(size_t llama_set_state_data( @@ -2062,9 +2102,10 @@ def llama_state_set_data( ctypes.c_size_t, ) def llama_set_state_data( - ctx: llama_context_p, src: CtypesArray[ctypes.c_uint8], /, + ctx: llama_context_p, src: CtypesArray[ctypes.c_uint8], / ) -> int: """Set the state reading from the specified address""" + ... # Save/load session file @@ -2089,7 +2130,7 @@ def llama_state_load_file( ctx: llama_context_p, path_session: bytes, tokens_out: CtypesArray[llama_token], - n_token_capacity: ctypes.c_size_t | int, + n_token_capacity: Union[ctypes.c_size_t, int], n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, ) -> bool: @@ -2118,7 +2159,7 @@ def llama_load_session_file( ctx: llama_context_p, path_session: bytes, tokens_out: CtypesArray[llama_token], - n_token_capacity: ctypes.c_size_t | int, + n_token_capacity: Union[ctypes.c_size_t, int], n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, ) -> int: @@ -2144,7 +2185,7 @@ def llama_state_save_file( ctx: llama_context_p, path_session: bytes, tokens: CtypesArray[llama_token], - n_token_count: ctypes.c_size_t | int, + n_token_count: Union[ctypes.c_size_t, int], /, ) -> bool: ... @@ -2170,7 +2211,7 @@ def llama_save_session_file( ctx: llama_context_p, path_session: bytes, tokens: CtypesArray[llama_token], - n_token_count: ctypes.c_size_t | int, + n_token_count: Union[ctypes.c_size_t, int], /, ) -> int: ... @@ -2187,6 +2228,7 @@ def llama_save_session_file( ) def llama_state_seq_get_size(ctx: llama_context_p, seq_id: llama_seq_id, /) -> int: """Get the exact size needed to copy the KV cache of a single sequence""" + ... # // Copy the KV cache of a single sequence into the specified buffer @@ -2208,11 +2250,12 @@ def llama_state_seq_get_size(ctx: llama_context_p, seq_id: llama_seq_id, /) -> i def llama_state_seq_get_data( ctx: llama_context_p, dst: CtypesArray[ctypes.c_uint8], - size: ctypes.c_size_t | int, + size: Union[ctypes.c_size_t, int], seq_id: llama_seq_id, /, ) -> int: """Copy the KV cache of a single sequence into the specified buffer""" + ... # // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence @@ -2237,11 +2280,12 @@ def llama_state_seq_get_data( def llama_state_seq_set_data( ctx: llama_context_p, src: CtypesArray[ctypes.c_uint8], - size: ctypes.c_size_t | int, + size: Union[ctypes.c_size_t, int], dest_seq_id: llama_seq_id, /, ) -> int: """Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence""" + ... # LLAMA_API size_t llama_state_seq_save_file( @@ -2266,7 +2310,7 @@ def llama_state_seq_save_file( filepath: bytes, seq_id: llama_seq_id, tokens: CtypesArray[llama_token], - n_token_count: ctypes.c_size_t | int, + n_token_count: Union[ctypes.c_size_t, int], /, ) -> int: ... 
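# Per-sequence counterpart of the snapshot above: copy the KV state of one
# sequence into another using the seq-level state calls. `ctx` is an assumed
# live llama_context and both sequence ids are assumed to be valid.
import ctypes
import llama_cpp

def copy_sequence_state(ctx, src_seq: int, dst_seq: int) -> int:
    size = llama_cpp.llama_state_seq_get_size(ctx, src_seq)
    buf = (ctypes.c_uint8 * size)()
    got = llama_cpp.llama_state_seq_get_data(ctx, buf, size, src_seq)
    # Feed the copied bytes back in under the destination sequence id and
    # return whatever the C call reports.
    return llama_cpp.llama_state_seq_set_data(ctx, buf, got, dst_seq)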
@@ -2296,7 +2340,7 @@ def llama_state_seq_load_file( filepath: bytes, dest_seq_id: llama_seq_id, tokens_out: CtypesArray[llama_token], - n_token_capacity: ctypes.c_size_t | int, + n_token_capacity: Union[ctypes.c_size_t, int], n_token_count_out: CtypesPointerOrRef[ctypes.c_size_t], /, ) -> int: @@ -2329,8 +2373,8 @@ def llama_state_seq_load_file( ) def llama_batch_get_one( tokens: CtypesArray[llama_token], - n_tokens: ctypes.c_int | int, - pos_0: llama_pos | int, + n_tokens: Union[ctypes.c_int, int], + pos_0: Union[llama_pos, int], seq_id: llama_seq_id, /, ) -> llama_batch: @@ -2338,6 +2382,7 @@ def llama_batch_get_one( NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it """ + ... # // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens @@ -2352,12 +2397,12 @@ def llama_batch_get_one( # int32_t embd, # int32_t n_seq_max); @ctypes_function( - "llama_batch_init", [ctypes.c_int32, ctypes.c_int32, ctypes.c_int32], llama_batch, + "llama_batch_init", [ctypes.c_int32, ctypes.c_int32, ctypes.c_int32], llama_batch ) def llama_batch_init( - n_tokens: ctypes.c_int32 | int, - embd: ctypes.c_int32 | int, - n_seq_max: ctypes.c_int32 | int, + n_tokens: Union[ctypes.c_int32, int], + embd: Union[ctypes.c_int32, int], + n_seq_max: Union[ctypes.c_int32, int], /, ) -> llama_batch: """Allocates a batch of tokens on the heap that can hold a maximum of n_tokens @@ -2367,6 +2412,7 @@ def llama_batch_init( Otherwise, llama_batch.token will be allocated to store n_tokens llama_token The rest of the llama_batch members are allocated with size n_tokens All members are left uninitialized""" + ... # // Frees a batch of tokens allocated with llama_batch_init() @@ -2374,6 +2420,7 @@ def llama_batch_init( @ctypes_function("llama_batch_free", [llama_batch], None) def llama_batch_free(batch: llama_batch, /): """Frees a batch of tokens allocated with llama_batch_init()""" + ... # // Processes a batch of tokens with the ecoder part of the encoder-decoder model. @@ -2389,6 +2436,7 @@ def llama_encode(ctx: llama_context_p, batch: llama_batch, /) -> int: Stores the encoder output internally for later use by the decoder cross-attention layers. 0 - success < 0 - error""" + ... # // Positive return values does not mean a fatal error, but rather a warning. @@ -2404,6 +2452,7 @@ def llama_decode(ctx: llama_context_p, batch: llama_batch, /) -> int: 0 - success 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) < 0 - error""" + ... # // Set the number of threads used for decoding @@ -2421,14 +2470,15 @@ def llama_decode(ctx: llama_context_p, batch: llama_batch, /) -> int: ) def llama_set_n_threads( ctx: llama_context_p, - n_threads: ctypes.c_uint32 | int, - n_threads_batch: ctypes.c_uint32 | int, + n_threads: Union[ctypes.c_uint32, int], + n_threads_batch: Union[ctypes.c_uint32, int], /, ): """Set the number of threads used for decoding n_threads is the number of threads used for generation (single token) n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens) """ + ... # // Get the number of threads used for generation of a single token. @@ -2436,6 +2486,7 @@ def llama_set_n_threads( @ctypes_function("llama_n_threads", [llama_context_p_ctypes], ctypes.c_uint32) def llama_n_threads(ctx: llama_context_p, /) -> int: """Get the number of threads used for generation of a single token""" + ... # // Get the number of threads used for prompt and batch processing (multiple token). 
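# Decoding sketch built from the batch helpers above: a list of token ids is
# pushed through llama_decode() using the transitional llama_batch_get_one()
# helper. `ctx` is an assumed live llama_context and `token_ids` a Python
# list of ints produced by the model's tokenizer.
import llama_cpp

def decode_prompt(ctx, token_ids) -> int:
    # generation threads / prompt-batch threads
    llama_cpp.llama_set_n_threads(ctx, 8, 8)
    tokens = (llama_cpp.llama_token * len(token_ids))(*token_ids)
    # All tokens are placed on sequence 0 starting at position 0.
    batch = llama_cpp.llama_batch_get_one(tokens, len(token_ids), 0, 0)
    # Per the docstring: 0 = success, 1 = no KV slot for the batch (shrink
    # the batch or grow the context), < 0 = error.
    return llama_cpp.llama_decode(ctx, batch)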
@@ -2443,6 +2494,7 @@ def llama_n_threads(ctx: llama_context_p, /) -> int: @ctypes_function("llama_n_threads_batch", [llama_context_p_ctypes], ctypes.c_uint32) def llama_n_threads_batch(ctx: llama_context_p, /) -> int: """Get the number of threads used for prompt and batch processing (multiple token)""" + ... # // Set whether the model is in embeddings mode or not @@ -2452,6 +2504,7 @@ def llama_n_threads_batch(ctx: llama_context_p, /) -> int: def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /): """Set whether the model is in embeddings model or not If true, embeddings will be returned but logits will not""" + ... # // Set whether to use causal attention or not @@ -2461,6 +2514,7 @@ def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /): def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /): """Set whether to use causal attention or not If set to true, the model will only attend to the past tokens""" + ... # // Set abort callback @@ -2477,6 +2531,7 @@ def llama_set_abort_callback( /, ): """Set abort callback""" + ... # // Wait until all computations are finished @@ -2488,6 +2543,7 @@ def llama_synchronize(ctx: llama_context_p, /): """Wait until all computations are finished This is automatically done when using one of the functions below to obtain the computation results and is not necessary to call it explicitly in most cases""" + ... # // Token logits obtained from the last call to llama_decode() @@ -2497,7 +2553,7 @@ def llama_synchronize(ctx: llama_context_p, /): # // Cols: n_vocab # LLAMA_API float * llama_get_logits(struct llama_context * ctx); @ctypes_function( - "llama_get_logits", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float), + "llama_get_logits", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float) ) def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: """Token logits obtained from the last call to llama_eval() @@ -2508,6 +2564,7 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: Returns: Pointer to the logits buffer of shape (n_tokens, n_vocab)""" + ... # // Logits for the ith token. For positive indices, Equivalent to: @@ -2521,10 +2578,11 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: ctypes.POINTER(ctypes.c_float), ) def llama_get_logits_ith( - ctx: llama_context_p, i: ctypes.c_int32 | int, /, + ctx: llama_context_p, i: Union[ctypes.c_int32, int], / ) -> CtypesArray[ctypes.c_float]: """Logits for the ith token. Equivalent to: llama_get_logits(ctx) + i*n_vocab""" + ... # // Get all output token embeddings. @@ -2535,11 +2593,12 @@ def llama_get_logits_ith( # // Otherwise, returns NULL. # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); @ctypes_function( - "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float), + "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float) ) def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: """Get the embeddings for the input shape: [n_embd] (1-dimensional)""" + ... # // Get the embeddings for the ith token. 
For positive indices, Equivalent to: @@ -2554,10 +2613,11 @@ def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float] ctypes.POINTER(ctypes.c_float), ) def llama_get_embeddings_ith( - ctx: llama_context_p, i: ctypes.c_int32 | int, /, + ctx: llama_context_p, i: Union[ctypes.c_int32, int], / ) -> CtypesArray[ctypes.c_float]: """Get the embeddings for the ith sequence llama_get_embeddings(ctx) + i*n_embd""" + ... # // Get the embeddings for a sequence id @@ -2570,11 +2630,12 @@ def llama_get_embeddings_ith( ctypes.POINTER(ctypes.c_float), ) def llama_get_embeddings_seq( - ctx: llama_context_p, seq_id: llama_seq_id | int, /, + ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / ) -> CtypesArray[ctypes.c_float]: """Get the embeddings for a sequence id Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE shape: [n_embd] (1-dimensional)""" + ... # // @@ -2584,30 +2645,30 @@ def llama_get_embeddings_seq( # LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token); @ctypes_function( - "llama_token_get_text", [llama_model_p_ctypes, llama_token], ctypes.c_char_p, + "llama_token_get_text", [llama_model_p_ctypes, llama_token], ctypes.c_char_p ) def llama_token_get_text( - model: llama_model_p, token: llama_token | int, /, + model: llama_model_p, token: Union[llama_token, int], / ) -> bytes: ... # LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token); @ctypes_function( - "llama_token_get_score", [llama_model_p_ctypes, llama_token], ctypes.c_float, + "llama_token_get_score", [llama_model_p_ctypes, llama_token], ctypes.c_float ) def llama_token_get_score( - model: llama_model_p, token: llama_token | int, /, + model: llama_model_p, token: Union[llama_token, int], / ) -> float: ... # LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token); @ctypes_function( - "llama_token_get_attr", [llama_model_p_ctypes, llama_token], ctypes.c_int, + "llama_token_get_attr", [llama_model_p_ctypes, llama_token], ctypes.c_int ) def llama_token_get_attr( - model: llama_model_p, token: llama_token | int, /, + model: llama_model_p, token: Union[llama_token, int], / ) -> int: ... @@ -2615,21 +2676,23 @@ def llama_token_get_attr( # // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) # LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token); @ctypes_function( - "llama_token_is_eog", [llama_model_p_ctypes, llama_token], ctypes.c_bool, + "llama_token_is_eog", [llama_model_p_ctypes, llama_token], ctypes.c_bool ) -def llama_token_is_eog(model: llama_model_p, token: llama_token | int, /) -> bool: +def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /) -> bool: """Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)""" + ... # // Identify if Token Id is a control token or a render-able token # LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token); @ctypes_function( - "llama_token_is_control", [llama_model_p_ctypes, llama_token], ctypes.c_bool, + "llama_token_is_control", [llama_model_p_ctypes, llama_token], ctypes.c_bool ) def llama_token_is_control( - model: llama_model_p, token: llama_token | int, /, + model: llama_model_p, token: Union[llama_token, int], / ) -> bool: """Identify if Token Id is a control token or a render-able token""" + ... 
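# Vocabulary introspection sketch using the token accessors above; `model` is
# an assumed loaded llama_model handle and `tok` a valid token id for it.
import llama_cpp

def inspect_token(model, tok: int) -> None:
    text = llama_cpp.llama_token_get_text(model, tok)    # raw piece, as bytes
    score = llama_cpp.llama_token_get_score(model, tok)
    print(f"token {tok}: {text!r} score={score:.3f}")
    if llama_cpp.llama_token_is_eog(model, tok):
        print("  ends generation (EOS/EOT-style token)")
    if llama_cpp.llama_token_is_control(model, tok):
        print("  control token, not meant to be rendered")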
# // Special tokens @@ -2639,30 +2702,35 @@ def llama_token_is_control( @ctypes_function("llama_token_bos", [llama_model_p_ctypes], llama_token) def llama_token_bos(model: llama_model_p, /) -> int: """beginning-of-sentence""" + ... # LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence @ctypes_function("llama_token_eos", [llama_model_p_ctypes], llama_token) def llama_token_eos(model: llama_model_p, /) -> int: """end-of-sentence""" + ... # LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification @ctypes_function("llama_token_cls", [llama_model_p_ctypes], llama_token) def llama_token_cls(model: llama_model_p, /) -> int: """classification""" + ... # LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator @ctypes_function("llama_token_sep", [llama_model_p_ctypes], llama_token) def llama_token_sep(model: llama_model_p, /) -> int: """sentence separator""" + ... # LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line @ctypes_function("llama_token_nl", [llama_model_p_ctypes], llama_token) def llama_token_nl(model: llama_model_p, /) -> int: """next-line""" + ... # // Returns -1 if unknown, 1 for true or 0 for false. @@ -2670,6 +2738,7 @@ def llama_token_nl(model: llama_model_p, /) -> int: @ctypes_function("llama_add_bos_token", [llama_model_p_ctypes], ctypes.c_int32) def llama_add_bos_token(model: llama_model_p, /) -> int: """Returns -1 if unknown, 1 for true or 0 for false.""" + ... # // Returns -1 if unknown, 1 for true or 0 for false. @@ -2677,6 +2746,7 @@ def llama_add_bos_token(model: llama_model_p, /) -> int: @ctypes_function("llama_add_eos_token", [llama_model_p_ctypes], ctypes.c_int32) def llama_add_eos_token(model: llama_model_p, /) -> int: """Returns -1 if unknown, 1 for true or 0 for false.""" + ... # // Codellama infill tokens @@ -2684,6 +2754,7 @@ def llama_add_eos_token(model: llama_model_p, /) -> int: @ctypes_function("llama_token_prefix", [llama_model_p_ctypes], llama_token) def llama_token_prefix(model: llama_model_p) -> int: """codellama infill tokens""" + ... # LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle @@ -2740,11 +2811,11 @@ def llama_token_eot(model: llama_model_p, /) -> int: def llama_tokenize( model: llama_model_p, text: bytes, - text_len: ctypes.c_int | int, + text_len: Union[ctypes.c_int, int], tokens: CtypesArray[llama_token], - n_tokens_max: ctypes.c_int | int, - add_special: ctypes.c_bool | bool, - parse_special: ctypes.c_bool | bool, + n_tokens_max: Union[ctypes.c_int, int], + add_special: Union[ctypes.c_bool, bool], + parse_special: Union[ctypes.c_bool, bool], /, ) -> int: """Convert the provided text into tokens. @@ -2762,6 +2833,7 @@ def llama_tokenize( Returns the number of tokens on success, no more than n_tokens_max Returns a negative number on failure - the number of tokens that would have been returned """ + ... # // Token Id -> Piece. @@ -2790,11 +2862,11 @@ def llama_tokenize( ) def llama_token_to_piece( model: llama_model_p, - token: llama_token | int, - buf: ctypes.c_char_p | bytes | CtypesArray[ctypes.c_char], - length: ctypes.c_int | int, - lstrip: ctypes.c_int | int, - special: ctypes.c_bool | bool, + token: Union[llama_token, int], + buf: Union[ctypes.c_char_p, bytes, CtypesArray[ctypes.c_char]], + length: Union[ctypes.c_int, int], + lstrip: Union[ctypes.c_int, int], + special: Union[ctypes.c_bool, bool], /, ) -> int: """Token Id -> Piece. 
@@ -2809,6 +2881,7 @@ def llama_token_to_piece( length: The length of the buffer. lstrip: The number of leading spaces to skip. special: If true, special tokens are rendered in the output.""" + ... # /// @details Convert the provided tokens into text (inverse of llama_tokenize()). @@ -2841,11 +2914,11 @@ def llama_token_to_piece( def llama_detokenize( model: llama_model_p, tokens: CtypesArray[llama_token], - n_tokens: ctypes.c_int | int, + n_tokens: Union[ctypes.c_int, int], text: bytes, - text_len_max: ctypes.c_int | int, - remove_special: ctypes.c_bool | bool, - unparse_special: ctypes.c_bool | bool, + text_len_max: Union[ctypes.c_int, int], + remove_special: Union[ctypes.c_bool, bool], + unparse_special: Union[ctypes.c_bool, bool], /, ) -> int: """Convert the provided tokens into text (inverse of llama_tokenize()). @@ -2858,6 +2931,7 @@ def llama_detokenize( text_len_max: The length of the buffer. remove_special: Allow to remove BOS and EOS tokens if model is configured to do so. unparse_special: If true, special tokens are rendered in the output.""" + ... # // @@ -2925,11 +2999,12 @@ def llama_grammar_init( rules: CtypesArray[ CtypesPointer[llama_grammar_element] ], # NOTE: This might be wrong type sig - n_rules: ctypes.c_size_t | int, - start_rule_index: ctypes.c_size_t | int, + n_rules: Union[ctypes.c_size_t, int], + start_rule_index: Union[ctypes.c_size_t, int], /, -) -> llama_grammar_p | None: +) -> llama_grammar_p: """Initialize a grammar from a set of rules.""" + ... # LLAMA_API void llama_grammar_free(struct llama_grammar * grammar); @@ -2940,6 +3015,7 @@ def llama_grammar_init( ) def llama_grammar_free(grammar: llama_grammar_p, /): """Free a grammar.""" + ... # LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar); @@ -2950,6 +3026,7 @@ def llama_grammar_free(grammar: llama_grammar_p, /): ) def llama_grammar_copy(grammar: llama_grammar_p, /) -> llama_grammar_p: """Copy a grammar.""" + ... # /// @details Apply constraints from grammar @@ -2969,10 +3046,13 @@ def llama_grammar_copy(grammar: llama_grammar_p, /) -> llama_grammar_p: def llama_grammar_sample( grammar: llama_grammar_p, ctx: llama_context_p, - candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], + candidates: Union[ + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + ], /, ): """Apply constraints from grammar""" + ... # LLAMA_API DEPRECATED(void llama_sample_grammar( @@ -2987,7 +3067,9 @@ def llama_grammar_sample( ) def llama_sample_grammar( ctx: llama_context_p, - candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], + candidates: Union[ + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + ], grammar, # type: llama_grammar_p /, ): @@ -2997,6 +3079,7 @@ def llama_sample_grammar( candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. grammar: A grammar object containing the rules and constraints to apply to the generated text. """ + ... # /// @details Accepts the sampled token into the grammar @@ -3012,10 +3095,11 @@ def llama_sample_grammar( def llama_grammar_accept_token( grammar: llama_grammar_p, ctx: llama_context_p, - token: llama_token | int, + token: Union[llama_token, int], /, ): """Accepts the sampled token into the grammar""" + ... 
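# Round-trip sketch for the tokenizer entry points above; `model` is an
# assumed loaded llama_model handle, the buffer sizes are generous guesses,
# and the negative-return convention of llama_tokenize() (how many tokens
# would have been needed) is used to report overflow.
import ctypes
import llama_cpp

def roundtrip(model, text: bytes) -> bytes:
    n_max = len(text) + 8
    toks = (llama_cpp.llama_token * n_max)()
    n = llama_cpp.llama_tokenize(
        model, text, len(text), toks, n_max,
        True,    # add_special
        False,   # parse_special
    )
    if n < 0:
        raise RuntimeError(f"token buffer too small, need {-n} tokens")
    out = ctypes.create_string_buffer(4 * len(text) + 16)
    m = llama_cpp.llama_detokenize(
        model, toks, n, out, ctypes.sizeof(out),
        False,   # remove_special
        False,   # unparse_special
    )
    return out.raw[:m] if m >= 0 else b""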
# // @@ -3030,8 +3114,9 @@ def llama_grammar_accept_token( [llama_context_p_ctypes, ctypes.c_uint32], None, ) -def llama_set_rng_seed(ctx: llama_context_p, seed: ctypes.c_uint32 | int, /): +def llama_set_rng_seed(ctx: llama_context_p, seed: Union[ctypes.c_uint32, int], /): """Sets the current rng seed.""" + ... # /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. @@ -3059,17 +3144,20 @@ def llama_set_rng_seed(ctx: llama_context_p, seed: ctypes.c_uint32 | int, /): ) def llama_sample_repetition_penalties( ctx: llama_context_p, - candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], + candidates: Union[ + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + ], last_tokens_data: CtypesArray[llama_token], - penalty_last_n: ctypes.c_size_t | int, - penalty_repeat: ctypes.c_float | float, - penalty_freq: ctypes.c_float | float, - penalty_present: ctypes.c_float | float, + penalty_last_n: Union[ctypes.c_size_t, int], + penalty_repeat: Union[ctypes.c_float, float], + penalty_freq: Union[ctypes.c_float, float], + penalty_present: Union[ctypes.c_float, float], /, ): """Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. """ + ... # /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 @@ -3095,10 +3183,11 @@ def llama_sample_apply_guidance( ctx: llama_context_p, logits: CtypesArray[ctypes.c_float], logits_guidance: CtypesArray[ctypes.c_float], - scale: ctypes.c_float | float, + scale: Union[ctypes.c_float, float], /, ): """Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806""" + ... # /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. @@ -3112,10 +3201,13 @@ def llama_sample_apply_guidance( ) def llama_sample_softmax( ctx: llama_context_p, - candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], + candidates: Union[ + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + ], /, ): """Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.""" + ... # /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 @@ -3131,12 +3223,15 @@ def llama_sample_softmax( ) def llama_sample_top_k( ctx: llama_context_p, - candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], - k: ctypes.c_int | int, - min_keep: ctypes.c_size_t | int, + candidates: Union[ + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + ], + k: Union[ctypes.c_int, int], + min_keep: Union[ctypes.c_size_t, int], /, ): """Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751""" + ... 
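# A plain-numpy restatement of the penalties the sampler above is documented
# to apply: the CTRL-style repeat penalty (with the negative-logit fix) plus
# the OpenAI-style frequency/presence penalties. This illustrates the idea
# only; it is not the C implementation behind the binding.
import numpy as np

def apply_penalties(logits, last_tokens,
                    penalty_repeat=1.1, penalty_freq=0.0, penalty_present=0.0):
    out = np.asarray(logits, dtype=np.float32).copy()
    counts = {}
    for t in last_tokens:
        counts[t] = counts.get(t, 0) + 1
    for tok, count in counts.items():
        # Negative-logit fix: dividing a negative logit would *reward* the
        # token, so negative logits are multiplied instead.
        if out[tok] > 0:
            out[tok] /= penalty_repeat
        else:
            out[tok] *= penalty_repeat
        # Frequency scales with how often the token appeared; presence is flat.
        out[tok] -= count * penalty_freq + penalty_present
    return out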
# /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 @@ -3152,12 +3247,15 @@ def llama_sample_top_k( ) def llama_sample_top_p( ctx: llama_context_p, - candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], - p: ctypes.c_float | float, - min_keep: ctypes.c_size_t | int, + candidates: Union[ + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + ], + p: Union[ctypes.c_float, float], + min_keep: Union[ctypes.c_size_t, int], /, ): """Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751""" + ... # /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 @@ -3173,12 +3271,15 @@ def llama_sample_top_p( ) def llama_sample_min_p( ctx: llama_context_p, - candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], - p: ctypes.c_float | float, - min_keep: ctypes.c_size_t | int, + candidates: Union[ + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + ], + p: Union[ctypes.c_float, float], + min_keep: Union[ctypes.c_size_t, int], /, ): """Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841""" + ... # /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. @@ -3194,12 +3295,15 @@ def llama_sample_min_p( ) def llama_sample_tail_free( ctx: llama_context_p, - candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], - z: ctypes.c_float | float, - min_keep: ctypes.c_size_t | int, + candidates: Union[ + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + ], + z: Union[ctypes.c_float, float], + min_keep: Union[ctypes.c_size_t, int], /, ): """Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.""" + ... # /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. @@ -3215,12 +3319,15 @@ def llama_sample_tail_free( ) def llama_sample_typical( ctx: llama_context_p, - candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], - p: ctypes.c_float | float, - min_keep: ctypes.c_size_t | int, + candidates: Union[ + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + ], + p: Union[ctypes.c_float, float], + min_keep: Union[ctypes.c_size_t, int], /, ): """Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.""" + ... # /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772. @@ -3243,13 +3350,16 @@ def llama_sample_typical( ) def llama_sample_entropy( ctx: llama_context_p, - candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], - min_temp: ctypes.c_float | float, - max_temp: ctypes.c_float | float, - exponent_val: ctypes.c_float | float, + candidates: Union[ + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + ], + min_temp: Union[ctypes.c_float, float], + max_temp: Union[ctypes.c_float, float], + exponent_val: Union[ctypes.c_float, float], /, ): """Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.""" + ... 
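# Numpy sketch of the nucleus (top-p) and min-p truncation rules referenced
# above, expressed over a probability vector; illustrative only, with the
# same min_keep safeguard the bindings expose.
import numpy as np

def top_p_mask(probs, p, min_keep=1):
    order = np.argsort(-probs)
    csum = np.cumsum(probs[order])
    # Smallest prefix whose cumulative probability reaches p, never fewer
    # than min_keep candidates.
    cutoff = max(int(np.searchsorted(csum, p)) + 1, min_keep)
    mask = np.zeros(probs.shape, dtype=bool)
    mask[order[:cutoff]] = True
    return mask

def min_p_mask(probs, p, min_keep=1):
    # Keep tokens whose probability is at least p times the best probability.
    keep = probs >= p * probs.max()
    if keep.sum() < min_keep:
        keep[np.argsort(-probs)[:min_keep]] = True
    return keep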
# LLAMA_API void llama_sample_temp( @@ -3263,8 +3373,10 @@ def llama_sample_entropy( ) def llama_sample_temp( ctx: llama_context_p, - candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], - temp: ctypes.c_float | float, + candidates: Union[ + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + ], + temp: Union[ctypes.c_float, float], /, ): """Temperature sampling described in academic paper "Generating Long Sequences with Sparse Transformers" https://arxiv.org/abs/1904.10509 @@ -3273,6 +3385,7 @@ def llama_sample_temp( candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. temp: The temperature value to use for the sampling. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. """ + ... # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. @@ -3302,10 +3415,12 @@ def llama_sample_temp( ) def llama_sample_token_mirostat( ctx: llama_context_p, - candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], - tau: ctypes.c_float | float, - eta: ctypes.c_float | float, - m: ctypes.c_int | int, + candidates: Union[ + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + ], + tau: Union[ctypes.c_float, float], + eta: Union[ctypes.c_float, float], + m: Union[ctypes.c_int, int], mu: CtypesPointerOrRef[ctypes.c_float], /, ) -> int: @@ -3318,6 +3433,7 @@ def llama_sample_token_mirostat( m: The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. """ + ... # /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. @@ -3344,9 +3460,11 @@ def llama_sample_token_mirostat( ) def llama_sample_token_mirostat_v2( ctx: llama_context_p, - candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], - tau: ctypes.c_float | float, - eta: ctypes.c_float | float, + candidates: Union[ + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + ], + tau: Union[ctypes.c_float, float], + eta: Union[ctypes.c_float, float], mu: CtypesPointerOrRef[ctypes.c_float], /, ) -> int: @@ -3358,6 +3476,7 @@ def llama_sample_token_mirostat_v2( eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. """ + ... # /// @details Selects the token with the highest probability. 
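# Conceptual sketch of the Mirostat v2 control loop the docstrings above
# describe: candidates whose surprise exceeds mu are dropped, one token is
# drawn from what remains, and mu is nudged toward the target surprise tau at
# learning rate eta (mu itself starts at 2 * tau, as noted above). Numpy
# pseudocode, not the binding; `rng` is assumed to be np.random.default_rng().
import numpy as np

def mirostat_v2_step(probs, mu, tau, eta, rng):
    surprise = -np.log2(probs + 1e-12)
    keep = surprise <= mu
    if not keep.any():
        keep[int(np.argmax(probs))] = True   # always keep the best candidate
    p = np.where(keep, probs, 0.0)
    p = p / p.sum()
    tok = int(rng.choice(len(p), p=p))
    err = float(surprise[tok]) - tau         # observed minus target surprisal
    mu = mu - eta * err                      # feedback update described above
    return tok, mu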
@@ -3372,10 +3491,13 @@ def llama_sample_token_mirostat_v2( ) def llama_sample_token_greedy( ctx: llama_context_p, - candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], + candidates: Union[ + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + ], /, ) -> int: """Selects the token with the highest probability.""" + ... # /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx. @@ -3389,10 +3511,13 @@ def llama_sample_token_greedy( ) def llama_sample_token( ctx: llama_context_p, - candidates: CtypesArray[llama_token_data_array] | CtypesPointerOrRef[llama_token_data_array], + candidates: Union[ + CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + ], /, ) -> int: """Randomly selects a token from the candidates based on their probabilities.""" + ... # // @@ -3411,13 +3536,14 @@ def llama_sample_token( ) def llama_split_path( split_path: bytes, - maxlen: ctypes.c_size_t | int, + maxlen: Union[ctypes.c_size_t, int], path_prefix: bytes, - split_no: ctypes.c_int | int, - split_count: ctypes.c_int | int, + split_no: Union[ctypes.c_int, int], + split_count: Union[ctypes.c_int, int], /, ) -> int: """Build a split GGUF final path for this chunk.""" + ... # /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. @@ -3431,13 +3557,14 @@ def llama_split_path( ) def llama_split_prefix( split_prefix: bytes, - maxlen: ctypes.c_size_t | int, + maxlen: Union[ctypes.c_size_t, int], split_path: bytes, - split_no: ctypes.c_int | int, - split_count: ctypes.c_int | int, + split_no: Union[ctypes.c_int, int], + split_count: Union[ctypes.c_int, int], /, ) -> int: """Extract the path prefix from the split_path if and only if the split_no and split_count match.""" + ... # Performance information @@ -3451,6 +3578,7 @@ def llama_split_prefix( ) def llama_get_timings(ctx: llama_context_p, /) -> llama_timings: """Get performance information""" + ... # LLAMA_API void llama_print_timings(struct llama_context * ctx); @@ -3461,6 +3589,7 @@ def llama_get_timings(ctx: llama_context_p, /) -> llama_timings: ) def llama_print_timings(ctx: llama_context_p, /): """Print performance information""" + ... # LLAMA_API void llama_reset_timings(struct llama_context * ctx); @@ -3471,6 +3600,7 @@ def llama_print_timings(ctx: llama_context_p, /): ) def llama_reset_timings(ctx: llama_context_p, /): """Reset performance information""" + ... # Print system information @@ -3482,6 +3612,7 @@ def llama_reset_timings(ctx: llama_context_p, /): ) def llama_print_system_info() -> bytes: """Print system information""" + ... # NOTE: THIS IS CURRENTLY BROKEN AS ggml_log_callback IS NOT EXPOSED IN LLAMA.H @@ -3494,13 +3625,14 @@ def llama_print_system_info() -> bytes: None, ) def llama_log_set( - log_callback: CtypesFuncPointer | None, + log_callback: Optional[CtypesFuncPointer], user_data: ctypes.c_void_p, /, ): """Set callback for all future logging events. If this is not called, or NULL is supplied, everything is output on stderr.""" + ... 
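# Small usage sketch for the split-path helper and the reporting calls above;
# `ctx` is an assumed live llama_context. llama_split_path() writes the file
# name for chunk `split_no` of `split_count` into a caller-supplied buffer.
import ctypes
import llama_cpp

def report(ctx) -> None:
    print(llama_cpp.llama_print_system_info().decode("utf-8", errors="ignore"))
    llama_cpp.llama_print_timings(ctx)
    llama_cpp.llama_reset_timings(ctx)

def split_name(path_prefix: bytes, split_no: int, split_count: int) -> bytes:
    buf = ctypes.create_string_buffer(512)
    llama_cpp.llama_split_path(buf, ctypes.sizeof(buf), path_prefix,
                               split_no, split_count)
    return buf.value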
# LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); From 9a13636afa907537e737a4fd375908bda6968b2f Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 08:14:18 +0200 Subject: [PATCH 158/177] Update llama.py --- llama_cpp/llama.py | 385 +++++++++++++++++++++++---------------------- 1 file changed, 196 insertions(+), 189 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6ddf5b7c7..e3d093e3d 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -12,15 +12,18 @@ import uuid import warnings from collections import deque -from collections.abc import Callable, Generator, Iterator, Sequence from pathlib import Path from typing import ( Any, + Callable, Deque, Dict, + Generator, + Iterator, List, Literal, Optional, + Sequence, Union, ) @@ -62,19 +65,21 @@ def __init__( n_gpu_layers: int = 0, split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER, main_gpu: int = 0, - tensor_split: list[float] | None = None, - rpc_servers: str | None = None, + tensor_split: Optional[List[float]] = None, + rpc_servers: Optional[str] = None, vocab_only: bool = False, use_mmap: bool = True, use_mlock: bool = False, - kv_overrides: dict[str, bool | int | float | str] | None = None, + kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None, # Context Params seed: int = llama_cpp.LLAMA_DEFAULT_SEED, n_ctx: int = 512, n_batch: int = 512, - n_threads: int | None = None, - n_threads_batch: int | None = None, - rope_scaling_type: int | None = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, + n_threads: Optional[int] = None, + n_threads_batch: Optional[int] = None, + rope_scaling_type: Optional[ + int + ] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED, rope_freq_base: float = 0.0, rope_freq_scale: float = 0.0, @@ -90,21 +95,21 @@ def __init__( # Sampling Params last_n_tokens_size: int = 64, # LoRA Params - lora_base: str | None = None, + lora_base: Optional[str] = None, lora_scale: float = 1.0, - lora_path: str | None = None, + lora_path: Optional[str] = None, # Backend Params - numa: bool | int = False, + numa: Union[bool, int] = False, # Chat Format Params - chat_format: str | None = None, - chat_handler: llama_chat_format.LlamaChatCompletionHandler | None = None, + chat_format: Optional[str] = None, + chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None, # Speculative Decoding - draft_model: LlamaDraftModel | None = None, + draft_model: Optional[LlamaDraftModel] = None, # Tokenizer Override - tokenizer: BaseLlamaTokenizer | None = None, + tokenizer: Optional[BaseLlamaTokenizer] = None, # KV cache quantization - type_k: int | None = None, - type_v: int | None = None, + type_k: Optional[int] = None, + type_v: Optional[int] = None, # Misc spm_infill: bool = False, verbose: bool = True, @@ -226,12 +231,12 @@ def __init__( if self.tensor_split is not None: if len(self.tensor_split) > llama_cpp.LLAMA_MAX_DEVICES: raise ValueError( - f"Attempt to split tensors that exceed maximum supported devices. Current LLAMA_MAX_DEVICES={llama_cpp.LLAMA_MAX_DEVICES}", + f"Attempt to split tensors that exceed maximum supported devices. 
Current LLAMA_MAX_DEVICES={llama_cpp.LLAMA_MAX_DEVICES}" ) # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES self._c_tensor_split = FloatArray( - *tensor_split, # type: ignore + *tensor_split # type: ignore ) # keep a reference to the array so it is not gc'd self.model_params.tensor_split = self._c_tensor_split self.model_params.vocab_only = vocab_only @@ -342,7 +347,7 @@ def __init__( # Sampling Params self.last_n_tokens_size = last_n_tokens_size - self.cache: BaseLlamaCache | None = None + self.cache: Optional[BaseLlamaCache] = None self.lora_base = lora_base self.lora_scale = lora_scale @@ -361,8 +366,8 @@ def __init__( path_model=self.model_path, params=self.model_params, verbose=self.verbose, - ), - ), + ) + ) ) # Override tokenizer @@ -381,8 +386,8 @@ def __init__( model=self._model, params=self.context_params, verbose=self.verbose, - ), - ), + ) + ) ) self._batch = self._stack.enter_context( @@ -392,11 +397,11 @@ def __init__( embd=0, n_seq_max=self.context_params.n_ctx, verbose=self.verbose, - ), - ), + ) + ) ) - self._lora_adapter: llama_cpp.llama_lora_adapter_p | None = None + self._lora_adapter: Optional[llama_cpp.llama_lora_adapter_p] = None if self.lora_path: assert self._model.model is not None @@ -406,14 +411,14 @@ def __init__( ) if self._lora_adapter is None: raise RuntimeError( - f"Failed to initialize LoRA adapter from lora path: {self.lora_path}", + f"Failed to initialize LoRA adapter from lora path: {self.lora_path}" ) assert self._ctx.ctx is not None if llama_cpp.llama_lora_adapter_set( - self._ctx.ctx, self._lora_adapter, self.lora_scale, + self._ctx.ctx, self._lora_adapter, self.lora_scale ): raise RuntimeError( - f"Failed to set LoRA adapter from lora path: {self.lora_path}", + f"Failed to set LoRA adapter from lora path: {self.lora_path}" ) if self.verbose: @@ -421,7 +426,7 @@ def __init__( self.chat_format = chat_format self.chat_handler = chat_handler - self._chat_handlers: dict[str, llama_chat_format.LlamaChatCompletionHandler] = ( + self._chat_handlers: Dict[str, llama_chat_format.LlamaChatCompletionHandler] = ( {} ) @@ -438,11 +443,11 @@ def __init__( self.n_tokens = 0 self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) self.scores: npt.NDArray[np.single] = np.ndarray( - (n_ctx, self._n_vocab), dtype=np.single, + (n_ctx, self._n_vocab), dtype=np.single ) self._mirostat_mu = ctypes.c_float( - 2.0 * 5.0, + 2.0 * 5.0 ) # TODO: Move this to sampling context try: @@ -497,7 +502,7 @@ def __init__( and "chat_template.default" in template_choices ): chat_format = llama_chat_format.guess_chat_format_from_gguf_metadata( - self.metadata, + self.metadata ) if chat_format is not None: @@ -519,7 +524,7 @@ def __init__( self.chat_format = "llama-2" if self.verbose: print( - f"Using fallback chat format: {self.chat_format}", file=sys.stderr, + f"Using fallback chat format: {self.chat_format}", file=sys.stderr ) @property @@ -541,19 +546,19 @@ def _scores(self) -> npt.NDArray[np.single]: return self.scores[: self.n_tokens, :] @property - def eval_tokens(self) -> deque[int]: + def eval_tokens(self) -> Deque[int]: return deque(self.input_ids[: self.n_tokens].tolist(), maxlen=self._n_ctx) @property - def eval_logits(self) -> deque[list[float]]: + def eval_logits(self) -> Deque[List[float]]: return deque( self.scores[: self.n_tokens, :].tolist(), maxlen=self._n_ctx if self.context_params.logits_all else 1, ) def tokenize( - self, text: bytes, add_bos: bool = True, special: bool 
= False, - ) -> list[int]: + self, text: bytes, add_bos: bool = True, special: bool = False + ) -> List[int]: """Tokenize a string. Args: @@ -568,7 +573,7 @@ def tokenize( return self.tokenizer_.tokenize(text, add_bos, special) def detokenize( - self, tokens: list[int], prev_tokens: list[int] | None = None, + self, tokens: List[int], prev_tokens: Optional[List[int]] = None ) -> bytes: """Detokenize a list of tokens. @@ -581,7 +586,7 @@ def detokenize( """ return self.tokenizer_.detokenize(tokens, prev_tokens=prev_tokens) - def set_cache(self, cache: BaseLlamaCache | None): + def set_cache(self, cache: Optional[BaseLlamaCache]): """Set the cache. Args: @@ -616,7 +621,7 @@ def eval(self, tokens: Sequence[int]): n_past = self.n_tokens n_tokens = len(batch) self._batch.set_batch( - batch=batch, n_past=n_past, logits_all=self.context_params.logits_all, + batch=batch, n_past=n_past, logits_all=self.context_params.logits_all ) self._ctx.decode(self._batch) # Save tokens @@ -626,14 +631,14 @@ def eval(self, tokens: Sequence[int]): rows = n_tokens cols = self._n_vocab logits = np.ctypeslib.as_array( - self._ctx.get_logits(), shape=(rows * cols,), + self._ctx.get_logits(), shape=(rows * cols,) ) self.scores[n_past : n_past + n_tokens, :].reshape(-1)[::] = logits else: rows = 1 cols = self._n_vocab logits = np.ctypeslib.as_array( - self._ctx.get_logits(), shape=(rows * cols,), + self._ctx.get_logits(), shape=(rows * cols,) ) self.scores[n_past + n_tokens - 1, :].reshape(-1)[::] = logits # Update n_tokens @@ -654,9 +659,9 @@ def sample( mirostat_eta: float = 0.1, mirostat_tau: float = 5.0, penalize_nl: bool = True, - logits_processor: LogitsProcessorList | None = None, - grammar: LlamaGrammar | None = None, - idx: int | None = None, + logits_processor: Optional[LogitsProcessorList] = None, + grammar: Optional[LlamaGrammar] = None, + idx: Optional[int] = None, ): """Sample a token from the model. @@ -730,10 +735,10 @@ def generate( mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, penalize_nl: bool = True, - logits_processor: LogitsProcessorList | None = None, - stopping_criteria: StoppingCriteriaList | None = None, - grammar: LlamaGrammar | None = None, - ) -> Generator[int, Sequence[int] | None, None]: + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + grammar: Optional[LlamaGrammar] = None, + ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. 
Examples: @@ -759,18 +764,17 @@ def generate( # Check for kv cache prefix match if reset and self.n_tokens > 0: longest_prefix = 0 - for a, b in zip(self._input_ids, tokens[:-1], strict=False): + for a, b in zip(self._input_ids, tokens[:-1]): if a == b: longest_prefix += 1 else: break if longest_prefix > 0: + if self.verbose: + print("Llama.generate: prefix-match hit", file=sys.stderr) reset = False tokens = tokens[longest_prefix:] self.n_tokens = longest_prefix - if self.verbose: - print(f"Llama.generate: {longest_prefix} prefix-match hit, " - f"remaining {len(tokens)} prompt tokens to eval", file=sys.stderr) # Reset the model state if reset: @@ -808,7 +812,7 @@ def generate( sample_idx += 1 if stopping_criteria is not None and stopping_criteria( - self._input_ids, self._scores[-1, :], + self._input_ids, self._scores[-1, :] ): return tokens_or_none = yield token @@ -825,16 +829,16 @@ def generate( if self.draft_model is not None: self.input_ids[self.n_tokens : self.n_tokens + len(tokens)] = tokens draft_tokens = self.draft_model( - self.input_ids[: self.n_tokens + len(tokens)], + self.input_ids[: self.n_tokens + len(tokens)] ) tokens.extend( draft_tokens.astype(int)[ : self._n_ctx - self.n_tokens - len(tokens) - ], + ] ) def create_embedding( - self, input: str | list[str], model: str | None = None, + self, input: Union[str, List[str]], model: Optional[str] = None ) -> CreateEmbeddingResponse: """Embed a string. @@ -850,12 +854,12 @@ def create_embedding( input = input if isinstance(input, list) else [input] # get numeric embeddings - embeds: list[list[float]] | list[list[list[float]]] + embeds: Union[List[List[float]], List[List[List[float]]]] total_tokens: int embeds, total_tokens = self.embed(input, return_count=True) # type: ignore # convert to CreateEmbeddingResponse - data: list[Embedding] = [ + data: List[Embedding] = [ { "object": "embedding", "embedding": emb, @@ -876,7 +880,7 @@ def create_embedding( def embed( self, - input: str | list[str], + input: Union[str, List[str]], normalize: bool = False, truncate: bool = True, return_count: bool = False, @@ -899,7 +903,7 @@ def embed( if self.context_params.embeddings is False: raise RuntimeError( - "Llama model must be created with embedding=True to call this method", + "Llama model must be created with embedding=True to call this method" ) if self.verbose: @@ -914,9 +918,9 @@ def embed( self._batch.reset() # decode and fetch embeddings - data: list[list[float]] | list[list[list[float]]] = [] + data: Union[List[List[float]], List[List[List[float]]]] = [] - def decode_batch(seq_sizes: list[int]): + def decode_batch(seq_sizes: List[int]): assert self._ctx.ctx is not None llama_cpp.llama_kv_cache_clear(self._ctx.ctx) self._ctx.decode(self._batch) @@ -927,7 +931,7 @@ def decode_batch(seq_sizes: list[int]): pos: int = 0 for i, size in enumerate(seq_sizes): ptr = llama_cpp.llama_get_embeddings(self._ctx.ctx) - embedding: list[list[float]] = [ + embedding: List[List[float]] = [ ptr[pos + j * n_embd : pos + (j + 1) * n_embd] for j in range(size) ] @@ -938,7 +942,7 @@ def decode_batch(seq_sizes: list[int]): else: for i in range(len(seq_sizes)): ptr = llama_cpp.llama_get_embeddings_seq(self._ctx.ctx, i) - embedding: list[float] = ptr[:n_embd] + embedding: List[float] = ptr[:n_embd] if normalize: embedding = _normalize_embedding(embedding) data.append(embedding) @@ -961,7 +965,7 @@ def decode_batch(seq_sizes: list[int]): # check for overrun if n_tokens > n_batch: raise ValueError( - f"Requested tokens ({n_tokens}) exceed batch size of 
{n_batch}", + f"Requested tokens ({n_tokens}) exceed batch size of {n_batch}" ) # time to eval batch @@ -992,37 +996,39 @@ def decode_batch(seq_sizes: list[int]): if return_count: return output, total_tokens - - return output + else: + return output def _create_completion( self, - prompt: str | list[int], - suffix: str | None = None, - max_tokens: int | None = 16, + prompt: Union[str, List[int]], + suffix: Optional[str] = None, + max_tokens: Optional[int] = 16, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, typical_p: float = 1.0, - logprobs: int | None = None, + logprobs: Optional[int] = None, echo: bool = False, - stop: str | list[str] | None = [], + stop: Optional[Union[str, List[str]]] = [], frequency_penalty: float = 0.0, presence_penalty: float = 0.0, repeat_penalty: float = 1.0, top_k: int = 40, stream: bool = False, - seed: int | None = None, + seed: Optional[int] = None, tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - model: str | None = None, - stopping_criteria: StoppingCriteriaList | None = None, - logits_processor: LogitsProcessorList | None = None, - grammar: LlamaGrammar | None = None, - logit_bias: dict[str, float] | None = None, - ) -> Iterator[CreateCompletionResponse] | Iterator[CreateCompletionStreamResponse]: + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, + grammar: Optional[LlamaGrammar] = None, + logit_bias: Optional[Dict[str, float]] = None, + ) -> Union[ + Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] + ]: assert self._ctx is not None assert suffix is None or suffix.__class__ is str @@ -1037,9 +1043,9 @@ def _create_completion( add_space_prefix: bool = ( self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true" ) - bos_tokens: list[int] = [cls_token_id if cls_token_id != -1 else bos_token_id] - eos_tokens: list[int] = [ - sep_token_id if sep_token_id != -1 else self.token_eos(), + bos_tokens: List[int] = [cls_token_id if cls_token_id != -1 else bos_token_id] + eos_tokens: List[int] = [ + sep_token_id if sep_token_id != -1 else self.token_eos() ] if ( @@ -1062,9 +1068,9 @@ def _create_completion( # If prompt is empty, initialize completion with BOS token to avoid # detokenization including a space at the beginning of the completion - completion_tokens: list[int] = [] if len(prompt) > 0 else [bos_token_id] + completion_tokens: List[int] = [] if len(prompt) > 0 else [bos_token_id] # Add blank space to start of prompt to match OG llama tokenizer - prefix_tokens: list[int] = ( + prefix_tokens: List[int] = ( [prefix_token_id] if prefix_token_id >= 0 and suffix is not None else [] ) + ( ( @@ -1079,7 +1085,7 @@ def _create_completion( if isinstance(prompt, str) else prompt ) - suffix_tokens: list[int] = ( + suffix_tokens: List[int] = ( ( [suffix_token_id] + ( @@ -1093,10 +1099,10 @@ def _create_completion( if suffix_token_id >= 0 and suffix is not None else [] ) - middle_tokens: list[int] = ( + middle_tokens: List[int] = ( [middle_token_id] if middle_token_id >= 0 and suffix is not None else [] ) - prompt_tokens: list[int] = ( + prompt_tokens: List[int] = ( bos_tokens + ( (suffix_tokens + prefix_tokens + middle_tokens) @@ -1128,7 +1134,7 @@ def logit_bias_processor( scores: npt.NDArray[np.single], ) -> npt.NDArray[np.single]: new_scores = np.copy( - scores, + scores ) # Does it make sense to copy the whole array or can we just overwrite the original one? 
for input_id, score in logit_bias_map.items(): new_scores[input_id] = score + scores[input_id] @@ -1145,7 +1151,7 @@ def logit_bias_processor( if len(prompt_tokens) >= self._n_ctx: raise ValueError( - f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}", + f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" ) if max_tokens is None or max_tokens <= 0: @@ -1166,17 +1172,17 @@ def logit_bias_processor( if logprobs is not None and self.context_params.logits_all is False: raise ValueError( - "logprobs is not supported for models created with logits_all=False", + "logprobs is not supported for models created with logits_all=False" ) if self.cache: try: cache_item = self.cache[prompt_tokens] cache_prefix_len = Llama.longest_token_prefix( - cache_item.input_ids.tolist(), prompt_tokens, + cache_item.input_ids.tolist(), prompt_tokens ) eval_prefix_len = Llama.longest_token_prefix( - self._input_ids.tolist(), prompt_tokens, + self._input_ids.tolist(), prompt_tokens ) if cache_prefix_len > eval_prefix_len: self.load_state(cache_item) @@ -1271,7 +1277,7 @@ def logit_bias_processor( [token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], - ), + ) ) # Check if stop sequence is in the token if token_end_position > ( @@ -1288,18 +1294,18 @@ def logit_bias_processor( completion_tokens[:returned_tokens], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], - ).decode("utf-8", errors="ignore"), + ).decode("utf-8", errors="ignore") ) token_offset = len(prompt_tokens) + returned_tokens logits = self._scores[token_offset - 1, :] current_logprobs = Llama.logits_to_logprobs(logits).tolist() sorted_logprobs = sorted( - zip(current_logprobs, range(len(current_logprobs)), strict=False), + zip(current_logprobs, range(len(current_logprobs))), reverse=True, ) top_logprob = { self.detokenize([i]).decode( - "utf-8", errors="ignore", + "utf-8", errors="ignore" ): logprob for logprob, i in sorted_logprobs[:logprobs] } @@ -1310,7 +1316,7 @@ def logit_bias_processor( [token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], - ).decode("utf-8", errors="ignore"), + ).decode("utf-8", errors="ignore") ], "text_offset": [text_offset], "token_logprobs": [current_logprobs[int(token)]], @@ -1332,7 +1338,7 @@ def logit_bias_processor( "index": 0, "logprobs": logprobs_or_none, "finish_reason": None, - }, + } ], } else: @@ -1374,7 +1380,7 @@ def logit_bias_processor( "index": 0, "logprobs": None, "finish_reason": None, - }, + } ], } @@ -1384,7 +1390,7 @@ def logit_bias_processor( break if stopping_criteria is not None and stopping_criteria( - self._input_ids, self._scores[-1, :], + self._input_ids, self._scores[-1, :] ): text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) finish_reason = "stop" @@ -1410,28 +1416,28 @@ def logit_bias_processor( self.detokenize( [token], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], - ), + ) ) - logprobs_or_none: CompletionLogprobs | None = None + logprobs_or_none: Optional[CompletionLogprobs] = None if logprobs is not None: if token == bos_token_id: continue token_str = self.detokenize([token]).decode( - "utf-8", errors="ignore", + "utf-8", errors="ignore" ) text_offset = len(prompt) + len( self.detokenize( completion_tokens[:returned_tokens], prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], - ), + ) ) token_offset = len(prompt_tokens) + returned_tokens - 1 logits = self._scores[token_offset, :] 
current_logprobs = Llama.logits_to_logprobs(logits).tolist() sorted_logprobs = sorted( - zip(current_logprobs, range(len(current_logprobs)), strict=False), + zip(current_logprobs, range(len(current_logprobs))), reverse=True, ) top_logprob = { @@ -1441,7 +1447,7 @@ def logit_bias_processor( top_logprob.update({token_str: current_logprobs[int(token)]}) logprobs_or_none = { "tokens": [ - self.detokenize([token]).decode("utf-8", errors="ignore"), + self.detokenize([token]).decode("utf-8", errors="ignore") ], "text_offset": [text_offset], "token_logprobs": [current_logprobs[int(token)]], @@ -1466,7 +1472,7 @@ def logit_bias_processor( "index": 0, "logprobs": logprobs_or_none, "finish_reason": None, - }, + } ], } break @@ -1479,12 +1485,12 @@ def logit_bias_processor( "choices": [ { "text": self.detokenize([token]).decode( - "utf-8", errors="ignore", + "utf-8", errors="ignore" ), "index": 0, "logprobs": logprobs_or_none, "finish_reason": None, - }, + } ], } yield { @@ -1498,15 +1504,14 @@ def logit_bias_processor( "index": 0, "logprobs": None, "finish_reason": finish_reason, - }, + } ], } if self.cache: if self.verbose: print("Llama._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() - if self.verbose: - print("Llama._create_completion: cache saved", file=sys.stderr) + print("Llama._create_completion: cache saved", file=sys.stderr) return if self.cache: @@ -1522,14 +1527,14 @@ def logit_bias_processor( if suffix_token_id < 0 and suffix is not None: text_str = text_str + suffix - logprobs_or_none: CompletionLogprobs | None = None + logprobs_or_none: Optional[CompletionLogprobs] = None if logprobs is not None: text_offset = 0 if echo else len(prompt) token_offset = 0 if echo else len(prompt_tokens[1:]) - text_offsets: list[int] = [] - token_logprobs: list[float | None] = [] - tokens: list[str] = [] - top_logprobs: list[dict[str, float] | None] = [] + text_offsets: List[int] = [] + token_logprobs: List[Optional[float]] = [] + tokens: List[str] = [] + top_logprobs: List[Optional[Dict[str, float]]] = [] if echo: # Remove leading BOS token if exists @@ -1542,14 +1547,14 @@ def logit_bias_processor( all_token_strs = [ self.detokenize([token], prev_tokens=all_tokens[:i]).decode( - "utf-8", errors="ignore", + "utf-8", errors="ignore" ) for i, token in enumerate(all_tokens) ] all_logprobs = Llama.logits_to_logprobs(self._scores)[token_offset:] # TODO: may be able to change this loop to use np.take_along_dim for idx, (token, token_str, logprobs_token) in enumerate( - zip(all_tokens, all_token_strs, all_logprobs, strict=False), + zip(all_tokens, all_token_strs, all_logprobs) ): if token == bos_token_id: continue @@ -1557,18 +1562,18 @@ def logit_bias_processor( text_offset + len( self.detokenize(all_tokens[:idx]).decode( - "utf-8", errors="ignore", - ), - ), + "utf-8", errors="ignore" + ) + ) ) tokens.append(token_str) sorted_logprobs = sorted( - zip(logprobs_token, range(len(logprobs_token)), strict=False), reverse=True, + zip(logprobs_token, range(len(logprobs_token))), reverse=True ) token_logprobs.append(logprobs_token[int(token)]) - top_logprob: dict[str, float] | None = { + top_logprob: Optional[Dict[str, float]] = { self.detokenize([i], prev_tokens=all_tokens[:idx]).decode( - "utf-8", errors="ignore", + "utf-8", errors="ignore" ): logprob for logprob, i in sorted_logprobs[:logprobs] } @@ -1598,7 +1603,7 @@ def logit_bias_processor( "index": 0, "logprobs": logprobs_or_none, "finish_reason": finish_reason, - }, + } ], "usage": { 
"prompt_tokens": len(prompt_tokens), @@ -1609,32 +1614,32 @@ def logit_bias_processor( def create_completion( self, - prompt: str | list[int], - suffix: str | None = None, - max_tokens: int | None = 16, + prompt: Union[str, List[int]], + suffix: Optional[str] = None, + max_tokens: Optional[int] = 16, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, typical_p: float = 1.0, - logprobs: int | None = None, + logprobs: Optional[int] = None, echo: bool = False, - stop: str | list[str] | None = [], + stop: Optional[Union[str, List[str]]] = [], frequency_penalty: float = 0.0, presence_penalty: float = 0.0, repeat_penalty: float = 1.0, top_k: int = 40, stream: bool = False, - seed: int | None = None, + seed: Optional[int] = None, tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - model: str | None = None, - stopping_criteria: StoppingCriteriaList | None = None, - logits_processor: LogitsProcessorList | None = None, - grammar: LlamaGrammar | None = None, - logit_bias: dict[str, float] | None = None, - ) -> CreateCompletionResponse | Iterator[CreateCompletionStreamResponse]: + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, + grammar: Optional[LlamaGrammar] = None, + logit_bias: Optional[Dict[str, float]] = None, + ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. Args: @@ -1707,31 +1712,31 @@ def create_completion( def __call__( self, prompt: str, - suffix: str | None = None, - max_tokens: int | None = 16, + suffix: Optional[str] = None, + max_tokens: Optional[int] = 16, temperature: float = 0.8, top_p: float = 0.95, min_p: float = 0.05, typical_p: float = 1.0, - logprobs: int | None = None, + logprobs: Optional[int] = None, echo: bool = False, - stop: str | list[str] | None = [], + stop: Optional[Union[str, List[str]]] = [], frequency_penalty: float = 0.0, presence_penalty: float = 0.0, repeat_penalty: float = 1.0, top_k: int = 40, stream: bool = False, - seed: int | None = None, + seed: Optional[int] = None, tfs_z: float = 1.0, mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - model: str | None = None, - stopping_criteria: StoppingCriteriaList | None = None, - logits_processor: LogitsProcessorList | None = None, - grammar: LlamaGrammar | None = None, - logit_bias: dict[str, float] | None = None, - ) -> CreateCompletionResponse | Iterator[CreateCompletionStreamResponse]: + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, + grammar: Optional[LlamaGrammar] = None, + logit_bias: Optional[Dict[str, float]] = None, + ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]: """Generate text from a prompt. 
Args: @@ -1798,21 +1803,21 @@ def __call__( def create_chat_completion( self, - messages: list[ChatCompletionRequestMessage], - functions: list[ChatCompletionFunction] | None = None, - function_call: ChatCompletionRequestFunctionCall | None = None, - tools: list[ChatCompletionTool] | None = None, - tool_choice: ChatCompletionToolChoiceOption | None = None, + messages: List[ChatCompletionRequestMessage], + functions: Optional[List[ChatCompletionFunction]] = None, + function_call: Optional[ChatCompletionRequestFunctionCall] = None, + tools: Optional[List[ChatCompletionTool]] = None, + tool_choice: Optional[ChatCompletionToolChoiceOption] = None, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, min_p: float = 0.05, typical_p: float = 1.0, stream: bool = False, - stop: str | list[str] | None = [], - seed: int | None = None, - response_format: ChatCompletionRequestResponseFormat | None = None, - max_tokens: int | None = None, + stop: Optional[Union[str, List[str]]] = [], + seed: Optional[int] = None, + response_format: Optional[ChatCompletionRequestResponseFormat] = None, + max_tokens: Optional[int] = None, presence_penalty: float = 0.0, frequency_penalty: float = 0.0, repeat_penalty: float = 1.0, @@ -1820,13 +1825,15 @@ def create_chat_completion( mirostat_mode: int = 0, mirostat_tau: float = 5.0, mirostat_eta: float = 0.1, - model: str | None = None, - logits_processor: LogitsProcessorList | None = None, - grammar: LlamaGrammar | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - top_logprobs: int | None = None, - ) -> CreateChatCompletionResponse | Iterator[CreateChatCompletionStreamResponse]: + model: Optional[str] = None, + logits_processor: Optional[LogitsProcessorList] = None, + grammar: Optional[LlamaGrammar] = None, + logit_bias: Optional[Dict[str, float]] = None, + logprobs: Optional[bool] = None, + top_logprobs: Optional[int] = None, + ) -> Union[ + CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse] + ]: """Generate a chat completion from a list of messages. Args: @@ -1922,12 +1929,12 @@ def create_chat_completion_openai_v1( assert isinstance(stream, bool) if stream: return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) # type: ignore - - return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore + else: + return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore except ImportError: raise ImportError( "To use create_chat_completion_openai_v1, you must install the openai package." - "You can install it with `pip install openai`.", + "You can install it with `pip install openai`." 
) def __getstate__(self): @@ -2063,19 +2070,19 @@ def pooling_type(self) -> str: def close(self) -> None: """Explicitly free the model from memory.""" - if hasattr(self, "_stack"): + if hasattr(self,'_stack'): if self._stack is not None: self._stack.close() def __del__(self) -> None: - if hasattr(self, "_lora_adapter"): + if hasattr(self,'_lora_adapter'): if self._lora_adapter is not None: llama_cpp.llama_lora_adapter_free(self._lora_adapter) self.close() @staticmethod def logits_to_logprobs( - logits: npt.NDArray[np.single] | list, axis: int = -1, + logits: Union[npt.NDArray[np.single], List], axis: int = -1 ) -> npt.NDArray[np.single]: # https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.log_softmax.html logits_maxs: np.ndarray = np.amax(logits, axis=axis, keepdims=True) @@ -2094,7 +2101,7 @@ def logits_to_logprobs( @staticmethod def longest_token_prefix(a: Sequence[int], b: Sequence[int]): longest_prefix = 0 - for _a, _b in zip(a, b, strict=False): + for _a, _b in zip(a, b): if _a == _b: longest_prefix += 1 else: @@ -2105,10 +2112,10 @@ def longest_token_prefix(a: Sequence[int], b: Sequence[int]): def from_pretrained( cls, repo_id: str, - filename: str | None, - local_dir: str | os.PathLike[str] | None = None, - local_dir_use_symlinks: bool | Literal["auto"] = "auto", - cache_dir: str | os.PathLike[str] | None = None, + filename: Optional[str], + local_dir: Optional[Union[str, os.PathLike[str]]] = None, + local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", + cache_dir: Optional[Union[str, os.PathLike[str]]] = None, **kwargs: Any, ) -> Llama: """Create a Llama model from a pretrained model name or path. @@ -2130,7 +2137,7 @@ def from_pretrained( except ImportError: raise ImportError( "Llama.from_pretrained requires the huggingface-hub package. " - "You can install it with `pip install huggingface-hub`.", + "You can install it with `pip install huggingface-hub`." 
) validate_repo_id(repo_id) @@ -2139,11 +2146,11 @@ def from_pretrained( files = [ file["name"] if isinstance(file, dict) else file - for file in hffs.ls(repo_id, recursive=True) + for file in hffs.ls(repo_id) ] # split each file into repo_id, subfolder, filename - file_list: list[str] = [] + file_list: List[str] = [] for file in files: rel_path = Path(file).relative_to(repo_id) file_list.append(str(rel_path)) @@ -2153,13 +2160,13 @@ def from_pretrained( if len(matching_files) == 0: raise ValueError( f"No file found in {repo_id} that match {filename}\n\n" - f"Available Files:\n{json.dumps(file_list)}", + f"Available Files:\n{json.dumps(file_list)}" ) if len(matching_files) > 1: raise ValueError( f"Multiple files found in {repo_id} matching {filename}\n\n" - f"Available Files:\n{json.dumps(files)}", + f"Available Files:\n{json.dumps(files)}" ) (matching_file,) = matching_files @@ -2213,13 +2220,13 @@ def __init__( LogitsProcessor = Callable[ - [npt.NDArray[np.intc], npt.NDArray[np.single]], npt.NDArray[np.single], + [npt.NDArray[np.intc], npt.NDArray[np.single]], npt.NDArray[np.single] ] -class LogitsProcessorList(list[LogitsProcessor]): +class LogitsProcessorList(List[LogitsProcessor]): def __call__( - self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single], + self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single] ) -> npt.NDArray[np.single]: for processor in self: scores = processor(input_ids, scores) @@ -2229,9 +2236,9 @@ def __call__( StoppingCriteria = Callable[[npt.NDArray[np.intc], npt.NDArray[np.single]], bool] -class StoppingCriteriaList(list[StoppingCriteria]): +class StoppingCriteriaList(List[StoppingCriteria]): def __call__( - self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single], + self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single] ) -> bool: return any([stopping_criteria(input_ids, logits) for stopping_criteria in self]) @@ -2243,7 +2250,7 @@ def __init__(self, min_tokens: int, token_eos: int): self.prompt_tokens = None def __call__( - self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single], + self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single] ) -> npt.NDArray[np.single]: if self.prompt_tokens is None: self.prompt_tokens = len(input_ids) From e15563ff5bbc2f20d5ac33c37e8523d02ca98218 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 08:21:41 +0200 Subject: [PATCH 159/177] Update app.py --- llama_cpp/server/app.py | 74 ++++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 6afb0ec8b..ec7da0712 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -4,10 +4,9 @@ import json import os import typing -from collections.abc import Iterator from functools import partial from threading import Lock -from typing import Dict, List, Optional, Union +from typing import Dict, Iterator, List, Optional, Union import anyio from anyio.streams.memory import MemoryObjectSendStream @@ -45,7 +44,7 @@ router = APIRouter(route_class=RouteErrorHandler) -_server_settings: ServerSettings | None = None +_server_settings: Optional[ServerSettings] = None def set_server_settings(server_settings: ServerSettings): @@ -57,13 +56,13 @@ def get_server_settings(): yield _server_settings -_llama_proxy: LlamaProxy | None = None +_llama_proxy: Optional[LlamaProxy] = None llama_outer_lock = Lock() llama_inner_lock = Lock() -def set_llama_proxy(model_settings: list[ModelSettings]): 
+def set_llama_proxy(model_settings: List[ModelSettings]): global _llama_proxy _llama_proxy = LlamaProxy(models=model_settings) @@ -87,7 +86,7 @@ def get_llama_proxy(): llama_outer_lock.release() -_ping_message_factory: typing.Callable[[], bytes] | None = None +_ping_message_factory: typing.Optional[typing.Callable[[], bytes]] = None def set_ping_message_factory(factory: typing.Callable[[], bytes]): @@ -98,7 +97,7 @@ def set_ping_message_factory(factory: typing.Callable[[], bytes]): def create_app( settings: Settings | None = None, server_settings: ServerSettings | None = None, - model_settings: list[ModelSettings] | None = None, + model_settings: List[ModelSettings] | None = None, ): config_file = os.environ.get("CONFIG_FILE", None) if config_file is not None: @@ -110,7 +109,7 @@ def create_app( import yaml config_file_settings = ConfigFileSettings.model_validate_json( - json.dumps(yaml.safe_load(f)), + json.dumps(yaml.safe_load(f)) ) else: config_file_settings = ConfigFileSettings.model_validate_json(f.read()) @@ -157,7 +156,7 @@ async def get_event_publisher( request: Request, inner_send_chan: MemoryObjectSendStream[typing.Any], iterator: Iterator[typing.Any], - on_complete: typing.Callable[[], None] | None = None, + on_complete: typing.Optional[typing.Callable[[], None]] = None, ): server_settings = next(get_server_settings()) interrupt_requests = ( @@ -185,9 +184,9 @@ async def get_event_publisher( def _logit_bias_tokens_to_input_ids( llama: llama_cpp.Llama, - logit_bias: dict[str, float], -) -> dict[str, float]: - to_bias: dict[str, float] = {} + logit_bias: Dict[str, float], +) -> Dict[str, float]: + to_bias: Dict[str, float] = {} for token, score in logit_bias.items(): token = token.encode("utf-8") for input_id in llama.tokenize(token, add_bos=False, special=True): @@ -201,7 +200,7 @@ def _logit_bias_tokens_to_input_ids( async def authenticate( settings: Settings = Depends(get_server_settings), - authorization: str | None = Depends(bearer_scheme), + authorization: Optional[str] = Depends(bearer_scheme), ): # Skip API key check if it's not set in settings if settings.api_key is None: @@ -237,10 +236,10 @@ async def authenticate( "application/json": { "schema": { "anyOf": [ - {"$ref": "#/components/schemas/CreateCompletionResponse"}, + {"$ref": "#/components/schemas/CreateCompletionResponse"} ], "title": "Completion response, when stream=False", - }, + } }, "text/event-stream": { "schema": { @@ -248,10 +247,10 @@ async def authenticate( "title": "Server Side Streaming response, when stream=True. " + "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format", "example": """data: {... see CreateCompletionResponse ...} \\n\\n data: ... \\n\\n ... 
data: [DONE]""", - }, + } }, }, - }, + } }, tags=[openai_v1_tag], ) @@ -267,7 +266,7 @@ async def create_completion( ) -> llama_cpp.Completion: exit_stack = contextlib.ExitStack() llama_proxy = await run_in_threadpool( - lambda: exit_stack.enter_context(contextlib.contextmanager(get_llama_proxy)()), + lambda: exit_stack.enter_context(contextlib.contextmanager(get_llama_proxy)()) ) if llama_proxy is None: raise HTTPException( @@ -281,7 +280,7 @@ async def create_completion( llama = llama_proxy( body.model if request.url.path != "/v1/engines/copilot-codex/completions" - else "copilot-codex", + else "copilot-codex" ) exclude = { @@ -305,14 +304,17 @@ async def create_completion( if body.min_tokens > 0: _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( - [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())], + [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] ) if "logits_processor" not in kwargs: kwargs["logits_processor"] = _min_tokens_logits_processor else: kwargs["logits_processor"].extend(_min_tokens_logits_processor) - iterator_or_completion: llama_cpp.CreateCompletionResponse | Iterator[llama_cpp.CreateCompletionStreamResponse] = await run_in_threadpool(llama, **kwargs) + iterator_or_completion: Union[ + llama_cpp.CreateCompletionResponse, + Iterator[llama_cpp.CreateCompletionStreamResponse], + ] = await run_in_threadpool(llama, **kwargs) if isinstance(iterator_or_completion, Iterator): # EAFP: It's easier to ask for forgiveness than permission @@ -338,7 +340,8 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: sep="\n", ping_message_factory=_ping_message_factory, ) - return iterator_or_completion + else: + return iterator_or_completion @router.post( @@ -370,11 +373,11 @@ async def create_embedding( "schema": { "anyOf": [ { - "$ref": "#/components/schemas/CreateChatCompletionResponse", - }, + "$ref": "#/components/schemas/CreateChatCompletionResponse" + } ], "title": "Completion response, when stream=False", - }, + } }, "text/event-stream": { "schema": { @@ -382,10 +385,10 @@ async def create_embedding( "title": "Server Side Streaming response, when stream=True" + "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format", "example": """data: {... see CreateChatCompletionResponse ...} \\n\\n data: ... \\n\\n ... 
data: [DONE]""", - }, + } }, }, - }, + } }, tags=[openai_v1_tag], ) @@ -437,7 +440,7 @@ async def create_chat_completion( "required": ["name", "age"], }, }, - }, + } ], "tool_choice": { "type": "function", @@ -459,7 +462,7 @@ async def create_chat_completion( "top_logprobs": 10, }, }, - }, + } ), ) -> llama_cpp.ChatCompletion: # This is a workaround for an issue in FastAPI dependencies @@ -468,7 +471,7 @@ async def create_chat_completion( # https://github.com/tiangolo/fastapi/issues/11143 exit_stack = contextlib.ExitStack() llama_proxy = await run_in_threadpool( - lambda: exit_stack.enter_context(contextlib.contextmanager(get_llama_proxy)()), + lambda: exit_stack.enter_context(contextlib.contextmanager(get_llama_proxy)()) ) if llama_proxy is None: raise HTTPException( @@ -495,14 +498,16 @@ async def create_chat_completion( if body.min_tokens > 0: _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( - [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())], + [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] ) if "logits_processor" not in kwargs: kwargs["logits_processor"] = _min_tokens_logits_processor else: kwargs["logits_processor"].extend(_min_tokens_logits_processor) - iterator_or_completion: llama_cpp.ChatCompletion | Iterator[llama_cpp.ChatCompletionChunk] = await run_in_threadpool(llama.create_chat_completion, **kwargs) + iterator_or_completion: Union[ + llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk] + ] = await run_in_threadpool(llama.create_chat_completion, **kwargs) if isinstance(iterator_or_completion, Iterator): # EAFP: It's easier to ask for forgiveness than permission @@ -528,8 +533,9 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: sep="\n", ping_message_factory=_ping_message_factory, ) - exit_stack.close() - return iterator_or_completion + else: + exit_stack.close() + return iterator_or_completion @router.get( From 82bead9f2ea71549df86b9075e4cff3f09cd57ef Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 08:32:42 +0200 Subject: [PATCH 160/177] Update types.py --- llama_cpp/server/types.py | 136 +++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 68 deletions(-) diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index 5943c6613..e95ab11ac 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -1,18 +1,18 @@ from __future__ import annotations -from typing import Dict, List, Literal, Optional, Union +from typing import Dict, List, Optional, Union from pydantic import BaseModel, Field -from typing_extensions import TypedDict +from typing_extensions import Literal, TypedDict import llama_cpp model_field = Field( - description="The model to use for generating completions.", default=None, + description="The model to use for generating completions.", default=None ) max_tokens_field = Field( - default=16, ge=1, description="The maximum number of tokens to generate.", + default=16, ge=1, description="The maximum number of tokens to generate." 
) min_tokens_field = Field( @@ -96,7 +96,7 @@ ) mirostat_eta_field = Field( - default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate", + default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate" ) grammar = Field( @@ -106,15 +106,15 @@ class CreateCompletionRequest(BaseModel): - prompt: str | list[str] = Field( - default="", description="The prompt to generate completions for.", + prompt: Union[str, List[str]] = Field( + default="", description="The prompt to generate completions for." ) - suffix: str | None = Field( + suffix: Optional[str] = Field( default=None, description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.", ) - max_tokens: int | None = Field( - default=16, ge=0, description="The maximum number of tokens to generate.", + max_tokens: Optional[int] = Field( + default=16, ge=0, description="The maximum number of tokens to generate." ) min_tokens: int = min_tokens_field temperature: float = temperature_field @@ -124,32 +124,32 @@ class CreateCompletionRequest(BaseModel): default=False, description="Whether to echo the prompt in the generated text. Useful for chatbots.", ) - stop: str | list[str] | None = stop_field + stop: Optional[Union[str, List[str]]] = stop_field stream: bool = stream_field - logprobs: int | None = Field( + logprobs: Optional[int] = Field( default=None, ge=0, description="The number of logprobs to generate. If None, no logprobs are generated.", ) - presence_penalty: float | None = presence_penalty_field - frequency_penalty: float | None = frequency_penalty_field - logit_bias: dict[str, float] | None = Field(None) - seed: int | None = Field(None) + presence_penalty: Optional[float] = presence_penalty_field + frequency_penalty: Optional[float] = frequency_penalty_field + logit_bias: Optional[Dict[str, float]] = Field(None) + seed: Optional[int] = Field(None) # ignored or currently unsupported - model: str | None = model_field - n: int | None = 1 - best_of: int | None = 1 - user: str | None = Field(default=None) + model: Optional[str] = model_field + n: Optional[int] = 1 + best_of: Optional[int] = 1 + user: Optional[str] = Field(default=None) # llama.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field - logit_bias_type: Literal["input_ids", "tokens"] | None = Field(None) + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) mirostat_mode: int = mirostat_mode_field mirostat_tau: float = mirostat_tau_field mirostat_eta: float = mirostat_eta_field - grammar: str | None = None + grammar: Optional[str] = None model_config = { "json_schema_extra": { @@ -157,67 +157,67 @@ class CreateCompletionRequest(BaseModel): { "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", "stop": ["\n", "###"], - }, - ], - }, + } + ] + } } class CreateEmbeddingRequest(BaseModel): - model: str | None = model_field - input: str | list[str] = Field(description="The input to embed.") - user: str | None = Field(default=None) + model: Optional[str] = model_field + input: Union[str, List[str]] = Field(description="The input to embed.") + user: Optional[str] = Field(default=None) model_config = { "json_schema_extra": { "examples": [ { "input": "The food was delicious and the waiter...", - }, - ], - }, + } + ] + } } class ChatCompletionRequestMessage(BaseModel): role: Literal["system", "user", "assistant", "function"] = Field( - default="user", description="The role of the message.", + default="user", description="The role of the 
message." ) - content: str | None = Field( - default="", description="The content of the message.", + content: Optional[str] = Field( + default="", description="The content of the message." ) class CreateChatCompletionRequest(BaseModel): - messages: list[llama_cpp.ChatCompletionRequestMessage] = Field( - default=[], description="A list of messages to generate completions for.", + messages: List[llama_cpp.ChatCompletionRequestMessage] = Field( + default=[], description="A list of messages to generate completions for." ) - functions: list[llama_cpp.ChatCompletionFunction] | None = Field( + functions: Optional[List[llama_cpp.ChatCompletionFunction]] = Field( default=None, description="A list of functions to apply to the generated completions.", ) - function_call: llama_cpp.ChatCompletionRequestFunctionCall | None = Field( + function_call: Optional[llama_cpp.ChatCompletionRequestFunctionCall] = Field( default=None, description="A function to apply to the generated completions.", ) - tools: list[llama_cpp.ChatCompletionTool] | None = Field( + tools: Optional[List[llama_cpp.ChatCompletionTool]] = Field( default=None, description="A list of tools to apply to the generated completions.", ) - tool_choice: llama_cpp.ChatCompletionToolChoiceOption | None = Field( + tool_choice: Optional[llama_cpp.ChatCompletionToolChoiceOption] = Field( default=None, description="A tool to apply to the generated completions.", ) # TODO: verify - max_tokens: int | None = Field( + max_tokens: Optional[int] = Field( default=None, description="The maximum number of tokens to generate. Defaults to inf", ) min_tokens: int = min_tokens_field - logprobs: bool | None = Field( + logprobs: Optional[bool] = Field( default=False, description="Whether to output the logprobs or not. Default is True", ) - top_logprobs: int | None = Field( + top_logprobs: Optional[int] = Field( default=None, ge=0, description="The number of logprobs to generate. If None, no logprobs are generated. 
logprobs need to set to True.", @@ -225,29 +225,29 @@ class CreateChatCompletionRequest(BaseModel): temperature: float = temperature_field top_p: float = top_p_field min_p: float = min_p_field - stop: str | list[str] | None = stop_field + stop: Optional[Union[str, List[str]]] = stop_field stream: bool = stream_field - presence_penalty: float | None = presence_penalty_field - frequency_penalty: float | None = frequency_penalty_field - logit_bias: dict[str, float] | None = Field(None) - seed: int | None = Field(None) - response_format: llama_cpp.ChatCompletionRequestResponseFormat | None = Field( + presence_penalty: Optional[float] = presence_penalty_field + frequency_penalty: Optional[float] = frequency_penalty_field + logit_bias: Optional[Dict[str, float]] = Field(None) + seed: Optional[int] = Field(None) + response_format: Optional[llama_cpp.ChatCompletionRequestResponseFormat] = Field( default=None, ) # ignored or currently unsupported - model: str | None = model_field - n: int | None = 1 - user: str | None = Field(None) + model: Optional[str] = model_field + n: Optional[int] = 1 + user: Optional[str] = Field(None) # llama.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field - logit_bias_type: Literal["input_ids", "tokens"] | None = Field(None) + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) mirostat_mode: int = mirostat_mode_field mirostat_tau: float = mirostat_tau_field mirostat_eta: float = mirostat_eta_field - grammar: str | None = None + grammar: Optional[str] = None model_config = { "json_schema_extra": { @@ -255,15 +255,15 @@ class CreateChatCompletionRequest(BaseModel): { "messages": [ ChatCompletionRequestMessage( - role="system", content="You are a helpful assistant.", + role="system", content="You are a helpful assistant." ).model_dump(), ChatCompletionRequestMessage( - role="user", content="What is the capital of France?", + role="user", content="What is the capital of France?" 
).model_dump(), - ], - }, - ], - }, + ] + } + ] + } } @@ -271,25 +271,25 @@ class ModelData(TypedDict): id: str object: Literal["model"] owned_by: str - permissions: list[str] + permissions: List[str] class ModelList(TypedDict): object: Literal["list"] - data: list[ModelData] + data: List[ModelData] class TokenizeInputRequest(BaseModel): - model: str | None = model_field + model: Optional[str] = model_field input: str = Field(description="The input to tokenize.") model_config = { - "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]}, + "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]} } class TokenizeInputResponse(BaseModel): - tokens: list[int] = Field(description="A list of tokens.") + tokens: List[int] = Field(description="A list of tokens.") model_config = {"json_schema_extra": {"example": {"tokens": [123, 321, 222]}}} @@ -301,8 +301,8 @@ class TokenizeInputCountResponse(BaseModel): class DetokenizeInputRequest(BaseModel): - model: str | None = model_field - tokens: list[int] = Field(description="A list of toekns to detokenize.") + model: Optional[str] = model_field + tokens: List[int] = Field(description="A list of toekns to detokenize.") model_config = {"json_schema_extra": {"example": [{"tokens": [123, 321, 222]}]}} @@ -311,5 +311,5 @@ class DetokenizeInputResponse(BaseModel): text: str = Field(description="The detokenized text.") model_config = { - "json_schema_extra": {"example": {"text": "How many tokens in this query?"}}, + "json_schema_extra": {"example": {"text": "How many tokens in this query?"}} } From 2d9ee84c629cba3633a50c3b63b07d6c6c0b03c4 Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 08:33:14 +0200 Subject: [PATCH 161/177] Update errors.py --- llama_cpp/server/errors.py | 43 ++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/llama_cpp/server/errors.py b/llama_cpp/server/errors.py index f50154c81..dae9960ba 100644 --- a/llama_cpp/server/errors.py +++ b/llama_cpp/server/errors.py @@ -3,9 +3,8 @@ import sys import time import traceback -from collections.abc import Callable, Coroutine from re import Match, Pattern, compile -from typing import Dict, Optional, Tuple, Union +from typing import Callable, Coroutine, Dict, Optional, Tuple, Union from fastapi import ( HTTPException, @@ -28,8 +27,8 @@ class ErrorResponse(TypedDict): message: str type: str - param: str | None - code: str | None + param: Optional[str] + code: Optional[str] class ErrorResponseFormatters: @@ -46,9 +45,9 @@ class ErrorResponseFormatters: @staticmethod def context_length_exceeded( - request: CreateCompletionRequest | CreateChatCompletionRequest, + request: Union[CreateCompletionRequest, CreateChatCompletionRequest], match, # type: Match[str] # type: ignore - ) -> tuple[int, ErrorResponse]: + ) -> Tuple[int, ErrorResponse]: """Formatter for context length exceeded error""" context_window = int(match.group(2)) @@ -84,9 +83,9 @@ def context_length_exceeded( @staticmethod def model_not_found( - request: CreateCompletionRequest | CreateChatCompletionRequest, + request: Union[CreateCompletionRequest, CreateChatCompletionRequest], match, # type: Match[str] # type: ignore - ) -> tuple[int, ErrorResponse]: + ) -> Tuple[int, ErrorResponse]: """Formatter for model_not_found error""" model_path = str(match.group(1)) @@ -104,29 +103,35 @@ class RouteErrorHandler(APIRoute): # key: regex pattern for original error message from llama_cpp # value: formatter function - 
pattern_and_formatters: dict[ + pattern_and_formatters: Dict[ Pattern[str], Callable[ [ - CreateCompletionRequest | CreateChatCompletionRequest, + Union[CreateCompletionRequest, CreateChatCompletionRequest], Match[str], ], - tuple[int, ErrorResponse], + Tuple[int, ErrorResponse], ], ] = { compile( - r"Requested tokens \((\d+)\) exceed context window of (\d+)", + r"Requested tokens \((\d+)\) exceed context window of (\d+)" ): ErrorResponseFormatters.context_length_exceeded, compile( - r"Model path does not exist: (.+)", + r"Model path does not exist: (.+)" ): ErrorResponseFormatters.model_not_found, } def error_message_wrapper( self, error: Exception, - body: CreateChatCompletionRequest | CreateCompletionRequest | CreateEmbeddingRequest | None = None, - ) -> tuple[int, ErrorResponse]: + body: Optional[ + Union[ + CreateChatCompletionRequest, + CreateCompletionRequest, + CreateEmbeddingRequest, + ] + ] = None, + ) -> Tuple[int, ErrorResponse]: """Wraps error message in OpenAI style error response""" print(f"Exception: {error!s}", file=sys.stderr) traceback.print_exc(file=sys.stderr) @@ -174,7 +179,13 @@ async def custom_route_handler(request: Request) -> Response: try: if "messages" in json_body: # Chat completion - body: CreateChatCompletionRequest | CreateCompletionRequest | CreateEmbeddingRequest | None = CreateChatCompletionRequest(**json_body) + body: Optional[ + Union[ + CreateChatCompletionRequest, + CreateCompletionRequest, + CreateEmbeddingRequest, + ] + ] = CreateChatCompletionRequest(**json_body) elif "prompt" in json_body: # Text completion body = CreateCompletionRequest(**json_body) From e1337367ad89bb48a3533c1338eae8ec9d0f874d Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 08:36:16 +0200 Subject: [PATCH 162/177] Update cli.py --- llama_cpp/server/cli.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/llama_cpp/server/cli.py b/llama_cpp/server/cli.py index 763ef73dc..b2b2ac6cd 100644 --- a/llama_cpp/server/cli.py +++ b/llama_cpp/server/cli.py @@ -6,34 +6,35 @@ from pydantic import BaseModel -def _get_base_type(annotation: type[Any]) -> type[Any]: +def _get_base_type(annotation: Type[Any]) -> Type[Any]: if getattr(annotation, "__origin__", None) is Literal: assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore return type(annotation.__args__[0]) # type: ignore - if getattr(annotation, "__origin__", None) is Union: + elif getattr(annotation, "__origin__", None) is Union: assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore - non_optional_args: list[type[Any]] = [ + non_optional_args: List[Type[Any]] = [ arg for arg in annotation.__args__ if arg is not type(None) # type: ignore ] if non_optional_args: return _get_base_type(non_optional_args[0]) elif ( getattr(annotation, "__origin__", None) is list - or getattr(annotation, "__origin__", None) is list + or getattr(annotation, "__origin__", None) is List ): assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore return _get_base_type(annotation.__args__[0]) # type: ignore return annotation -def _contains_list_type(annotation: type[Any] | None) -> bool: +def _contains_list_type(annotation: Type[Any] | None) -> bool: origin = getattr(annotation, "__origin__", None) - if origin is list or origin is list: + if origin is list or origin is List: return True - if origin in (Literal, Union): + elif origin in (Literal, Union): return any(_contains_list_type(arg) for arg 
in annotation.__args__) # type: ignore - return False + else: + return False def _parse_bool_arg(arg: str | bytes | bool) -> bool: @@ -47,12 +48,13 @@ def _parse_bool_arg(arg: str | bytes | bool) -> bool: if arg_str in true_values: return True - if arg_str in false_values: + elif arg_str in false_values: return False - raise ValueError(f"Invalid boolean argument: {arg}") + else: + raise ValueError(f"Invalid boolean argument: {arg}") -def add_args_from_model(parser: argparse.ArgumentParser, model: type[BaseModel]): +def add_args_from_model(parser: argparse.ArgumentParser, model: Type[BaseModel]): """Add arguments from a pydantic model to an argparse parser.""" for name, field in model.model_fields.items(): @@ -80,7 +82,7 @@ def add_args_from_model(parser: argparse.ArgumentParser, model: type[BaseModel]) ) -T = TypeVar("T", bound=type[BaseModel]) +T = TypeVar("T", bound=Type[BaseModel]) def parse_model_from_args(model: T, args: argparse.Namespace) -> T: @@ -90,5 +92,5 @@ def parse_model_from_args(model: T, args: argparse.Namespace) -> T: k: v for k, v in vars(args).items() if v is not None and k in model.model_fields - }, + } ) From 5af3b532517036dbf26544434a93c157cd06201d Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 08:37:03 +0200 Subject: [PATCH 163/177] Update model.py --- llama_cpp/server/model.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 747343016..afc3f91df 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -9,7 +9,7 @@ class LlamaProxy: - def __init__(self, models: list[ModelSettings]) -> None: + def __init__(self, models: List[ModelSettings]) -> None: assert len(models) > 0, "No models provided!" 
self._model_settings_dict: dict[str, ModelSettings] = {} @@ -18,19 +18,19 @@ def __init__(self, models: list[ModelSettings]) -> None: model.model_alias = model.model self._model_settings_dict[model.model_alias] = model - self._current_model: llama_cpp.Llama | None = None - self._current_model_alias: str | None = None + self._current_model: Optional[llama_cpp.Llama] = None + self._current_model_alias: Optional[str] = None self._default_model_settings: ModelSettings = models[0] self._default_model_alias: str = self._default_model_settings.model_alias # type: ignore # Load default model self._current_model = self.load_llama_from_model_settings( - self._default_model_settings, + self._default_model_settings ) self._current_model_alias = self._default_model_alias - def __call__(self, model: str | None = None) -> llama_cpp.Llama: + def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: if model is None: model = self._default_model_alias @@ -53,7 +53,7 @@ def __call__(self, model: str | None = None) -> llama_cpp.Llama: def __getitem__(self, model: str): return self._model_settings_dict[model].model_dump() - def __setitem__(self, model: str, settings: ModelSettings | str | bytes): + def __setitem__(self, model: str, settings: Union[ModelSettings, str, bytes]): if isinstance(settings, (bytes, str)): settings = ModelSettings.model_validate_json(settings) self._model_settings_dict[model] = settings @@ -82,7 +82,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: ) else: chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler( - clip_model_path=settings.clip_model_path, verbose=settings.verbose, + clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "obsidian": assert settings.clip_model_path is not None, "clip model not found" @@ -96,7 +96,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: ) else: chat_handler = llama_cpp.llama_chat_format.ObsidianChatHandler( - clip_model_path=settings.clip_model_path, verbose=settings.verbose, + clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "llava-1-6": assert settings.clip_model_path is not None, "clip model not found" @@ -110,7 +110,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: ) else: chat_handler = llama_cpp.llama_chat_format.Llava16ChatHandler( - clip_model_path=settings.clip_model_path, verbose=settings.verbose, + clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "moondream": assert settings.clip_model_path is not None, "clip model not found" @@ -124,7 +124,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: ) else: chat_handler = llama_cpp.llama_chat_format.MoondreamChatHandler( - clip_model_path=settings.clip_model_path, verbose=settings.verbose, + clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "nanollava": assert settings.clip_model_path is not None, "clip model not found" @@ -138,7 +138,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: ) else: chat_handler = llama_cpp.llama_chat_format.NanoLlavaChatHandler( - clip_model_path=settings.clip_model_path, verbose=settings.verbose, + clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "llama-3-vision-alpha": assert settings.clip_model_path is not None, "clip model not found" @@ -152,7 
+152,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: ) else: chat_handler = llama_cpp.llama_chat_format.Llama3VisionAlpha( - clip_model_path=settings.clip_model_path, verbose=settings.verbose, + clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "hf-autotokenizer": assert ( @@ -160,7 +160,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: ), "hf_pretrained_model_name_or_path must be set for hf-autotokenizer" chat_handler = ( llama_cpp.llama_chat_format.hf_autotokenizer_to_chat_completion_handler( - settings.hf_pretrained_model_name_or_path, + settings.hf_pretrained_model_name_or_path ) ) elif settings.chat_format == "hf-tokenizer-config": @@ -168,22 +168,22 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: settings.hf_tokenizer_config_path is not None ), "hf_tokenizer_config_path must be set for hf-tokenizer-config" chat_handler = llama_cpp.llama_chat_format.hf_tokenizer_config_to_chat_completion_handler( - json.load(open(settings.hf_tokenizer_config_path)), + json.load(open(settings.hf_tokenizer_config_path)) ) - tokenizer: llama_cpp.BaseLlamaTokenizer | None = None + tokenizer: Optional[llama_cpp.BaseLlamaTokenizer] = None if settings.hf_pretrained_model_name_or_path is not None: tokenizer = llama_tokenizer.LlamaHFTokenizer.from_pretrained( - settings.hf_pretrained_model_name_or_path, + settings.hf_pretrained_model_name_or_path ) draft_model = None if settings.draft_model is not None: draft_model = llama_speculative.LlamaPromptLookupDecoding( - num_pred_tokens=settings.draft_model_num_pred_tokens, + num_pred_tokens=settings.draft_model_num_pred_tokens ) - kv_overrides: dict[str, bool | int | float | str] | None = None + kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None if settings.kv_overrides is not None: assert isinstance(settings.kv_overrides, list) kv_overrides = {} From 4494458ea2c0e57a0294a89ff53625be047ca55f Mon Sep 17 00:00:00 2001 From: Olivier DEBAUCHE Date: Thu, 15 Aug 2024 08:37:42 +0200 Subject: [PATCH 164/177] Update settings.py --- llama_cpp/server/settings.py | 42 ++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 03df095d7..848b93869 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -2,10 +2,10 @@ import multiprocessing from typing import Dict, List, Literal, Optional, Union, cast -from typing_extensions import Self from pydantic import Field, model_validator from pydantic_settings import BaseSettings +from typing_extensions import Self import llama_cpp @@ -19,7 +19,7 @@ class ModelSettings(BaseSettings): model: str = Field( description="The path to the model to use for generating completions.", ) - model_alias: str | None = Field( + model_alias: Optional[str] = Field( default=None, description="The alias of the model to use for generating completions.", ) @@ -38,7 +38,7 @@ class ModelSettings(BaseSettings): ge=0, description="Main GPU to use.", ) - tensor_split: list[float] | None = Field( + tensor_split: Optional[List[float]] = Field( default=None, description="Split layers across multiple GPUs in proportion.", ) @@ -53,11 +53,11 @@ class ModelSettings(BaseSettings): default=llama_cpp.llama_supports_mlock(), description="Use mlock.", ) - kv_overrides: list[str] | None = Field( + kv_overrides: Optional[List[str]] = Field( default=None, description="List of 
model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.", ) - rpc_servers: str | None = Field( + rpc_servers: Optional[str] = Field( default=None, description="comma seperated list of rpc servers for offloading", ) @@ -109,25 +109,25 @@ class ModelSettings(BaseSettings): description="Last n tokens to keep for repeat penalty calculation.", ) # LoRA Params - lora_base: str | None = Field( + lora_base: Optional[str] = Field( default=None, description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.", ) - lora_path: str | None = Field( + lora_path: Optional[str] = Field( default=None, description="Path to a LoRA file to apply to the model.", ) # Backend Params - numa: bool | int = Field( + numa: Union[bool, int] = Field( default=False, description="Enable NUMA support.", ) # Chat Format Params - chat_format: str | None = Field( + chat_format: Optional[str] = Field( default=None, description="Chat format to use.", ) - clip_model_path: str | None = Field( + clip_model_path: Optional[str] = Field( default=None, description="Path to a CLIP model to use for multi-modal chat completion.", ) @@ -145,21 +145,21 @@ class ModelSettings(BaseSettings): description="The size of the cache in bytes. Only used if cache is True.", ) # Tokenizer Options - hf_tokenizer_config_path: str | None = Field( + hf_tokenizer_config_path: Optional[str] = Field( default=None, description="The path to a HuggingFace tokenizer_config.json file.", ) - hf_pretrained_model_name_or_path: str | None = Field( + hf_pretrained_model_name_or_path: Optional[str] = Field( default=None, description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().", ) # Loading from HuggingFace Model Hub - hf_model_repo_id: str | None = Field( + hf_model_repo_id: Optional[str] = Field( default=None, description="The model repo id to use for the HuggingFace tokenizer model.", ) # Speculative Decoding - draft_model: str | None = Field( + draft_model: Optional[str] = Field( default=None, description="Method to use for speculative decoding. 
One of (prompt-lookup-decoding).",
     )
@@ -168,11 +168,11 @@ class ModelSettings(BaseSettings):
         description="Number of tokens to predict using the draft model.",
     )
     # KV Cache Quantization
-    type_k: int | None = Field(
+    type_k: Optional[int] = Field(
         default=None,
         description="Type of the key cache quantization.",
     )
-    type_v: int | None = Field(
+    type_v: Optional[int] = Field(
         default=None,
         description="Type of the value cache quantization.",
     )
@@ -187,7 +187,7 @@ class ModelSettings(BaseSettings):
     def set_dynamic_defaults(self) -> Self:
         # If n_threads or n_threads_batch is -1, set it to multiprocessing.cpu_count()
         cpu_count = multiprocessing.cpu_count()
-        values = cast(dict[str, int], self)
+        values = cast(Dict[str, int], self)
         if values.get("n_threads", 0) == -1:
             values["n_threads"] = cpu_count
         if values.get("n_threads_batch", 0) == -1:
@@ -201,14 +201,14 @@ class ServerSettings(BaseSettings):
     # Uvicorn Settings
     host: str = Field(default="localhost", description="Listen address")
     port: int = Field(default=8000, description="Listen port")
-    ssl_keyfile: str | None = Field(
+    ssl_keyfile: Optional[str] = Field(
         default=None, description="SSL key file for HTTPS",
     )
-    ssl_certfile: str | None = Field(
+    ssl_certfile: Optional[str] = Field(
         default=None, description="SSL certificate file for HTTPS",
     )
     # FastAPI Settings
-    api_key: str | None = Field(
+    api_key: Optional[str] = Field(
         default=None,
         description="API key for authentication. If set all requests need to be authenticated.",
     )
@@ -233,4 +233,4 @@ class Settings(ServerSettings, ModelSettings):
 class ConfigFileSettings(ServerSettings):
     """Configuration file format settings."""
 
-    models: list[ModelSettings] = Field(default=[], description="Model configs")
+    models: List[ModelSettings] = Field(default=[], description="Model configs")

From 55ba31a59aa72e31ea6e96cd41617443e49e2ffa Mon Sep 17 00:00:00 2001
From: Olivier DEBAUCHE
Date: Thu, 15 Aug 2024 08:43:18 +0200
Subject: [PATCH 165/177] Update test.yaml

---
 .github/workflows/test.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 78f0b4983..480ad0987 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
 
     steps:
       - uses: actions/checkout@v4
@@ -38,7 +38,7 @@ jobs:
     runs-on: windows-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
 
     steps:
       - uses: actions/checkout@v4
@@ -62,7 +62,7 @@ jobs:
     runs-on: macos-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
 
     steps:
       - uses: actions/checkout@v4

From 42e15fee22d15352d76a52b6e0c32017e29b86d8 Mon Sep 17 00:00:00 2001
From: Olivier DEBAUCHE
Date: Thu, 15 Aug 2024 08:44:15 +0200
Subject: [PATCH 166/177] Update test.yaml

---
 .github/workflows/test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 480ad0987..08e5e6657 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.rc1"]
 
     steps:
       - uses: actions/checkout@v4

From 00d9c9c8d93a5785479480c3b635fb796925085c Mon Sep 17 00:00:00 2001
From: Olivier DEBAUCHE
Date: Thu, 15 Aug 2024 08:46:18 +0200
Subject: [PATCH 167/177] Update test.yaml

---
 .github/workflows/test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 08e5e6657..2f3920d07 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.rc1"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.0rc1"]
 
     steps:
       - uses: actions/checkout@v4

From 09d4a2ea1cc3c7b71892adf12652a11950bc8959 Mon Sep 17 00:00:00 2001
From: Olivier DEBAUCHE
Date: Thu, 15 Aug 2024 08:48:03 +0200
Subject: [PATCH 168/177] Update test.yaml

---
 .github/workflows/test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 2f3920d07..1dff86abc 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.0rc1"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.0-rc.1"]
 
     steps:
       - uses: actions/checkout@v4

From 8614ee7ca0f1b2e6169da1d60975f0a1308cdbab Mon Sep 17 00:00:00 2001
From: Olivier DEBAUCHE
Date: Thu, 15 Aug 2024 08:48:35 +0200
Subject: [PATCH 169/177] Update test.yaml

---
 .github/workflows/test.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 1dff86abc..7ced2e3f3 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -38,7 +38,7 @@ jobs:
     runs-on: windows-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.0-rc.1"]
 
     steps:
       - uses: actions/checkout@v4
@@ -62,7 +62,7 @@ jobs:
     runs-on: macos-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.0-rc.1"]
 
     steps:
       - uses: actions/checkout@v4

From 022d2db6d4f38b749adec4a2c4e3b531870af765 Mon Sep 17 00:00:00 2001
From: Olivier DEBAUCHE
Date: Thu, 15 Aug 2024 08:54:15 +0200
Subject: [PATCH 170/177] Update test.yaml

---
 .github/workflows/test.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 7ced2e3f3..78f0b4983 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.0-rc.1"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
 
     steps:
       - uses: actions/checkout@v4
@@ -38,7 +38,7 @@ jobs:
     runs-on: windows-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.0-rc.1"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
 
     steps:
       - uses: actions/checkout@v4
@@ -62,7 +62,7 @@ jobs:
     runs-on: macos-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.0-rc.1"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
 
     steps:
       - uses: actions/checkout@v4

From 8e111ceddce228cc48e18cc55b07627ab65a96f8 Mon Sep 17 00:00:00 2001
From: Olivier DEBAUCHE
Date: Thu, 15 Aug 2024 08:57:19 +0200
Subject: [PATCH 171/177] Update ruff.toml

---
 ruff.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ruff.toml b/ruff.toml
index 672658e2e..f68c8c765 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -30,7 +30,7 @@ exclude = [
 ]
 
 # Same as Black.
-line-length = 120
+line-length = 80
 indent-width = 4
 
 # Assume Python 3.9
@@ -99,7 +99,7 @@ select = [
     "TD", # flake8-todos
     "TID", # flake8-tidy-imports
     "TRY", # tryceratops
-    "UP", # pyupgrade
+    # "UP", # pyupgrade
     "W", # pycodestyle
     "YTT", # flake8-2020
 ]

From 34c208727cb262499518803150cf220bf00f8f0a Mon Sep 17 00:00:00 2001
From: Olivier DEBAUCHE
Date: Thu, 15 Aug 2024 08:58:19 +0200
Subject: [PATCH 172/177] Update ruff.toml

---
 ruff.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ruff.toml b/ruff.toml
index f68c8c765..67644d9c2 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -96,7 +96,7 @@ select = [
     "T10", # flake8-debugger
     "T20", # flake8-print
     "TCH", # flake8-type-checking
-    "TD", # flake8-todos
+    # "TD", # flake8-todos
     "TID", # flake8-tidy-imports
     "TRY", # tryceratops
     # "UP", # pyupgrade

From 28e4dbd6d822e408a48226a4e296a57a5f6d04f4 Mon Sep 17 00:00:00 2001
From: Olivier DEBAUCHE
Date: Thu, 29 Aug 2024 19:28:27 +0200
Subject: [PATCH 173/177] Update .pre-commit-config.yaml

---
 .pre-commit-config.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 115751835..5f1fffcc7 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -8,7 +8,7 @@ repos:
 
   # ruff
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: "v0.5.7"
+    rev: "v0.6.2"
     hooks:
       # Run the linter.
       - id: ruff
@@ -19,7 +19,7 @@
         types_or: [ python, pyi, jupyter ]
 
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: "v1.11.1"
+    rev: "v1.11.2"
    hooks:
       - id: mypy
         args: [ '--ignore-missing-imports', '--disable-error-code=top-level-await', "--disable-error-code=empty-body" ]

From 04d2868f7fc601fbffe232ae8bd79b5950a2a642 Mon Sep 17 00:00:00 2001
From: Olivier DEBAUCHE
Date: Sat, 31 Aug 2024 17:47:50 +0200
Subject: [PATCH 174/177] Update .pre-commit-config.yaml

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5f1fffcc7..cba6eb4b2 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -8,7 +8,7 @@ repos:
 
   # ruff
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: "v0.6.2"
+    rev: "v0.6.3"
     hooks:
       # Run the linter.
       - id: ruff

From 1ac586f1b35f99d38812e5c28fe9bcd019fc7cd8 Mon Sep 17 00:00:00 2001
From: Olivier DEBAUCHE
Date: Sat, 31 Aug 2024 17:50:47 +0200
Subject: [PATCH 175/177] Update .pre-commit-config.yaml

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index cba6eb4b2..86ca40a38 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,7 +1,7 @@
 repos:
   # auto update
   - repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update
-    rev: "v0.3.3post1"
+    rev: "v0.4.0post1"
     hooks:
       - id: pre-commit-update
         args: [--dry-run, --all-versions]

From 6fd538c3c2c239c4cff455ea929f5f6e86bf20fa Mon Sep 17 00:00:00 2001
From: Olivier DEBAUCHE
Date: Sat, 7 Sep 2024 17:20:30 +0200
Subject: [PATCH 176/177] Update .pre-commit-config.yaml

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 86ca40a38..91c5f3f49 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -8,7 +8,7 @@ repos:
 
   # ruff
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: "v0.6.3"
+    rev: "v0.6.4"
     hooks:
       # Run the linter.
       - id: ruff

From 2dd0fced280c67a736465bdf73f58731903d0b5d Mon Sep 17 00:00:00 2001
From: Olivier DEBAUCHE
Date: Sun, 22 Sep 2024 07:42:51 +0200
Subject: [PATCH 177/177] Update .pre-commit-config.yaml

---
 .pre-commit-config.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 91c5f3f49..84c7865fa 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,14 +1,14 @@
 repos:
   # auto update
   - repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update
-    rev: "v0.4.0post1"
+    rev: "v0.5.0"
     hooks:
       - id: pre-commit-update
         args: [--dry-run, --all-versions]
 
   # ruff
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: "v0.6.4"
+    rev: "v0.6.7"
     hooks:
       # Run the linter.
       - id: ruff