
Commit 3fe8e9a

Merge branch 'main' of https://github.com/abetlen/llama-cpp-python into main
2 parents: 9dc5e20 + 1547202

6 files changed: +11 −4 lines


CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.2.73]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@25c6e82e7a1ad25a42b0894e87d9b5c557409516
+- fix: Clear kv cache at beginning of image chat formats to avoid bug when image is evaluated first by @abetlen in ac55d0a175115d1e719672ce1cb1bec776c738b1
+
 ## [0.2.72]
 
 - fix(security): Remote Code Execution by Server-Side Template Injection in Model Metadata by @retr0reg in b454f40a9a1787b2b5659cd2cb00819d983185df

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -51,8 +51,9 @@ if (LLAMA_BUILD)
     )
 
     if (LLAVA_BUILD)
-        if (LLAMA_CUBLAS)
+        if (LLAMA_CUBLAS OR LLAMA_CUDA)
             add_compile_definitions(GGML_USE_CUBLAS)
+            add_compile_definitions(GGML_USE_CUDA)
         endif()
 
         if (LLAMA_METAL)

Makefile

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ build.debug:
 	CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" python3 -m pip install --verbose --config-settings=cmake.verbose=true --config-settings=logging.level=INFO --config-settings=install.strip=false --editable .
 
 build.cuda:
-	CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3 -m pip install --verbose -e .
+	CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e .
 
 build.opencl:
 	CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -550,7 +550,7 @@ llm = Llama.from_pretrained(
     n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
 )
 
-respoonse = llm.create_chat_completion(
+response = llm.create_chat_completion(
     messages = [
         {
             "role": "user",

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.72"
+__version__ = "0.2.73"

llama_cpp/llama_chat_format.py

Lines changed: 1 addition & 0 deletions
@@ -2637,6 +2637,7 @@ def embed_image_bytes(image_bytes: bytes):
 
         # Evaluate prompt
         llama.reset()
+        llama._ctx.kv_cache_clear()
         for type_, value in split_text:
             if type_ == "text":
                 tokens = llama.tokenize(value.encode("utf8"), add_bos=False, special=True)
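
The added kv_cache_clear() targets the changelog's "image evaluated first" bug: llama.reset() rewinds the Python-side token counter, but on its own it can leave stale attention state in the underlying llama.cpp context, which matters when the first item in split_text is an image embedding rather than text. A minimal sketch of the intended control flow, with the wrapper function name chosen purely for illustration:

    # Sketch of the evaluation preamble after this change. `split_text` and
    # `embed_image_bytes` come from the surrounding handler; the function
    # name `evaluate_prompt` is hypothetical and only illustrates the flow.
    def evaluate_prompt(llama, split_text, embed_image_bytes):
        llama.reset()                # rewind Python-side token bookkeeping
        llama._ctx.kv_cache_clear()  # the fix: drop any stale kv cache entries
        for type_, value in split_text:
            if type_ == "text":
                tokens = llama.tokenize(value.encode("utf8"), add_bos=False, special=True)
                llama.eval(tokens)   # text tokens go through normal evaluation
            else:
                # image path: fetch bytes for `value`, embed them with
                # embed_image_bytes, and evaluate the embedding into the
                # context (details elided in this sketch)
                ...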
