From ba39a6c7464f007b926621065b1df372ea0f405c Mon Sep 17 00:00:00 2001
From: "serhii.n"
Date: Fri, 13 Jun 2025 18:48:31 +0300
Subject: [PATCH 1/3] Remove llama_kv_cache_view because it was deleted on the
 llama.cpp side too

rel: https://github.com/ggml-org/llama.cpp/pull/13653
---
 llama_cpp/llama_cpp.py | 109 -----------------------------------------
 vendor/llama.cpp       |   2 +-
 2 files changed, 1 insertion(+), 110 deletions(-)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 63de3a93a..01e78b710 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -1742,115 +1742,6 @@ def llama_apply_adapter_cvec(
 # //
 
 
-# // Information associated with an individual cell in the KV cache view.
-# struct llama_kv_cache_view_cell {
-#     // The position for this cell. Takes KV cache shifts into account.
-#     // May be negative if the cell is not populated.
-#     llama_pos pos;
-# };
-class llama_kv_cache_view_cell(ctypes.Structure):
-    """Information associated with an individual cell in the KV cache view.
-
-    Attributes:
-        pos (llama_pos): The position for this cell. Takes KV cache shifts into account.
-            May be negative if the cell is not populated."""
-
-    if TYPE_CHECKING:
-        pos: llama_pos
-
-    _fields_ = [("pos", llama_pos)]
-
-
-# // An updateable view of the KV cache.
-# struct llama_kv_cache_view {
-#     // Number of KV cache cells. This will be the same as the context size.
-#     int32_t n_cells;
-
-#     // Maximum number of sequences that can exist in a cell. It's not an error
-#     // if there are more sequences in a cell than this value, however they will
-#     // not be visible in the view cells_sequences.
-#     int32_t n_seq_max;
-
-#     // Number of tokens in the cache. For example, if there are two populated
-#     // cells, the first with 1 sequence id in it and the second with 2 sequence
-#     // ids then you'll have 3 tokens.
-#     int32_t token_count;
-
-#     // Number of populated cache cells.
-#     int32_t used_cells;
-
-#     // Maximum contiguous empty slots in the cache.
-#     int32_t max_contiguous;
-
-#     // Index to the start of the max_contiguous slot range. Can be negative
-#     // when cache is full.
-#     int32_t max_contiguous_idx;
-
-#     // Information for an individual cell.
-#     struct llama_kv_cache_view_cell * cells;
-
-
-#     // The sequences for each cell. There will be n_seq_max items per cell.
-#     llama_seq_id * cells_sequences;
-# };
-class llama_kv_cache_view(ctypes.Structure):
-    if TYPE_CHECKING:
-        n_cells: int
-        n_max_seq: int
-        token_count: int
-        used_cells: int
-        max_contiguous: int
-        max_contiguous_idx: int
-        cells: CtypesArray[llama_kv_cache_view_cell]
-        cells_sequences: CtypesArray[llama_seq_id]
-
-    _fields_ = [
-        ("n_cells", ctypes.c_int32),
-        ("n_max_seq", ctypes.c_int32),
-        ("token_count", ctypes.c_int32),
-        ("used_cells", ctypes.c_int32),
-        ("max_contiguous", ctypes.c_int32),
-        ("max_contiguous_idx", ctypes.c_int32),
-        ("cells", ctypes.POINTER(llama_kv_cache_view_cell)),
-        ("cells_sequences", ctypes.POINTER(llama_seq_id)),
-    ]
-
-
-llama_kv_cache_view_p = ctypes.POINTER(llama_kv_cache_view)
-
-
-# // Create an empty KV cache view. (use only for debugging purposes)
-# LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
-@ctypes_function(
-    "llama_kv_cache_view_init",
-    [llama_context_p_ctypes, ctypes.c_int32],
-    llama_kv_cache_view,
-)
-def llama_kv_cache_view_init(
-    ctx: llama_context_p, n_seq_max: Union[ctypes.c_int32, int], /
-) -> llama_kv_cache_view:
-    """Create an empty KV cache view. (use only for debugging purposes)"""
-    ...
-
-
-# // Free a KV cache view. (use only for debugging purposes)
-# LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
-@ctypes_function("llama_kv_cache_view_free", [llama_kv_cache_view_p], None)
-def llama_kv_cache_view_free(view: "ctypes.pointer[llama_kv_cache_view]", /):  # type: ignore
-    """Free a KV cache view. (use only for debugging purposes)"""
-    ...
-
-
-# // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
-# LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
-@ctypes_function(
-    "llama_kv_cache_view_update", [llama_context_p_ctypes, llama_kv_cache_view_p], None
-)
-def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[llama_kv_cache_view], /):  # type: ignore
-    """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)"""
-    ...
-
-
 # // Returns the number of tokens in the KV cache (slow, use only for debug)
 # // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
 # LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 8733e0cf6..26ff3685b 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 8733e0cf6eefc7c7752297cc22d0836706f4222c
+Subproject commit 26ff3685bfbaa4c8838d7afd988b17dd5eb99f92

From 15cc7e213fd5e7077ab303bd502a3c8ba783a52a Mon Sep 17 00:00:00 2001
From: "serhii.n"
Date: Fri, 13 Jun 2025 19:03:20 +0300
Subject: [PATCH 2/3] Remove deprecations that were deleted on the llama.cpp
 side too

rel: https://github.com/ggml-org/llama.cpp/pull/13653
---
 examples/notebooks/Batching.ipynb |   2 +-
 llama_cpp/_internals.py           |  10 +-
 llama_cpp/llama.py                |   4 +-
 llama_cpp/llama_cpp.py            | 176 +-----------------------------
 4 files changed, 12 insertions(+), 180 deletions(-)

diff --git a/examples/notebooks/Batching.ipynb b/examples/notebooks/Batching.ipynb
index be7fe9b52..b1992e9d0 100644
--- a/examples/notebooks/Batching.ipynb
+++ b/examples/notebooks/Batching.ipynb
@@ -230,7 +230,7 @@
    "outputs": [],
    "source": [
     "for i in range(n_parallel):\n",
-    "    llama_cpp.llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens)"
+    "    llama_cpp.llama_kv_self_seq_cp(ctx, 0, i, 0, batch.n_tokens)"
    ]
   },
   {
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index 343581dce..67ff1ab95 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -277,19 +277,19 @@ def pooling_type(self) -> int:
         return llama_cpp.llama_pooling_type(self.ctx)
 
     def kv_cache_clear(self):
-        llama_cpp.llama_kv_cache_clear(self.ctx)
+        llama_cpp.llama_kv_self_clear(self.ctx)
 
     def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
-        llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1)
+        llama_cpp.llama_kv_self_seq_rm(self.ctx, seq_id, p0, p1)
 
     def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
-        llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1)
+        llama_cpp.llama_kv_self_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1)
 
     def kv_cache_seq_keep(self, seq_id: int):
-        llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id)
+        llama_cpp.llama_kv_self_seq_keep(self.ctx, seq_id)
 
     def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
-        llama_cpp.llama_kv_cache_seq_add(self.ctx, seq_id, p0, p1, shift)
+        llama_cpp.llama_kv_self_seq_add(self.ctx, seq_id, p0, p1, shift)
 
     def get_state_size(self) -> int:
         return llama_cpp.llama_get_state_size(self.ctx)
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 7e9a6af23..f61503ac4 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1039,7 +1039,7 @@ def embed(
         data: Union[List[List[float]], List[List[List[float]]]] = []
 
         def decode_batch(seq_sizes: List[int]):
-            llama_cpp.llama_kv_cache_clear(self._ctx.ctx)
+            llama_cpp.llama_kv_self_clear(self._ctx.ctx)
             self._ctx.decode(self._batch)
             self._batch.reset()
 
@@ -1110,7 +1110,7 @@ def decode_batch(seq_sizes: List[int]):
 
         output = data[0] if isinstance(input, str) else data
 
-        llama_cpp.llama_kv_cache_clear(self._ctx.ctx)
+        llama_cpp.llama_kv_self_clear(self._ctx.ctx)
         self.reset()
 
         if return_count:
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 01e78b710..055936499 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -1755,18 +1755,6 @@ def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int:
     ...
 
 
-# DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
-#            "use llama_kv_self_n_tokens instead");
-@ctypes_function(
-    "llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32
-)
-def llama_get_kv_cache_token_count(ctx: llama_context_p, /) -> int:
-    """Returns the number of tokens in the KV cache (slow, use only for debug)
-    If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    """
-    ...
-
-
 # // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
 # LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
 @ctypes_function(
@@ -1777,16 +1765,6 @@ def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int:
     ...
 
 
-# DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
-#            "use llama_kv_self_used_cells instead");
-@ctypes_function(
-    "llama_get_kv_cache_used_cells", [llama_context_p_ctypes], ctypes.c_int32
-)
-def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int:
-    """Returns the number of used KV cells (i.e. have at least one sequence assigned to them)"""
-    ...
-
-
 # // Clear the KV cache - both cell info is erased and KV data is zeroed
 # LLAMA_API void llama_kv_self_clear(
 #         struct llama_context * ctx);
@@ -1797,25 +1775,18 @@ def llama_kv_self_clear(ctx: llama_context_p, /):
     """Clear the KV cache - both cell info is erased and KV data is zeroed"""
     ...
 
-# NOTE: Deprecated
-@ctypes_function("llama_kv_self_clear", [llama_context_p_ctypes], None)
-def llama_kv_cache_clear(ctx: llama_context_p, /):
-    """Clear the KV cache"""
-    ...
-
-
 # // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
 # // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
 # // seq_id < 0 : match any sequence
 # // p0 < 0 : [0, p1]
 # // p1 < 0 : [p0, inf)
-# LLAMA_API bool llama_kv_cache_seq_rm(
+# LLAMA_API bool llama_kv_self_seq_rm(
 #         struct llama_context * ctx,
 #                 llama_seq_id   seq_id,
 #                    llama_pos   p0,
 #                    llama_pos   p1);
 @ctypes_function(
-    "llama_kv_cache_seq_rm",
+    "llama_kv_self_seq_rm",
     [
         llama_context_p_ctypes,
         llama_seq_id,
@@ -1824,7 +1795,7 @@ def llama_kv_cache_clear(ctx: llama_context_p, /):
     ],
     ctypes.c_bool,
 )
-def llama_kv_cache_seq_rm(
+def llama_kv_self_seq_rm(
     ctx: llama_context_p,
     seq_id: Union[llama_seq_id, int],
     p0: Union[llama_pos, int],
@@ -1840,7 +1811,6 @@ def llama_kv_cache_seq_rm(
     p1: Union[llama_pos, int],
     /,
 ) -> bool:
     """Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
     Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
     seq_id < 0 : match any sequence
     p0 < 0 : [0, p1]
     p1 < 0 : [p0, inf)"""
     ...
-
 # // Copy all tokens that belong to the specified sequence to another sequence
 # // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
 # // p0 < 0 : [0, p1]
 # // p1 < 0 : [p0, inf)
@@ -1877,33 +1847,6 @@ def llama_kv_self_seq_cp(
     ...
 
 
-# NOTE: Deprecated
-@ctypes_function(
-    "llama_kv_self_seq_cp",
-    [
-        llama_context_p_ctypes,
-        llama_seq_id,
-        llama_seq_id,
-        llama_pos,
-        llama_pos,
-    ],
-    None,
-)
-def llama_kv_cache_seq_cp(
-    ctx: llama_context_p,
-    seq_id_src: Union[llama_seq_id, int],
-    seq_id_dst: Union[llama_seq_id, int],
-    p0: Union[llama_pos, int],
-    p1: Union[llama_pos, int],
-    /,
-):
-    """Copy all tokens that belong to the specified sequence to another sequence
-    Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
-    p0 < 0 : [0, p1]
-    p1 < 0 : [p0, inf)"""
-    ...
-
-
 # // Removes all tokens that do not belong to the specified sequence
 # LLAMA_API void llama_kv_self_seq_keep(
 #         struct llama_context * ctx,
@@ -1916,13 +1859,6 @@ def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int
     ...
 
 
-# NOTE: Deprecated
-@ctypes_function(
-    "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None
-)
-def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
-    """Removes all tokens that do not belong to the specified sequence"""
-    ...
 
 
@@ -1932,7 +1868,7 @@ def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, in
 # //   - explicitly with llama_kv_cache_update()
 # // p0 < 0 : [0, p1]
 # // p1 < 0 : [p0, inf)
-# LLAMA_API void llama_kv_cache_seq_add(
+# LLAMA_API void llama_kv_self_seq_add(
 #         struct llama_context * ctx,
 #                 llama_seq_id   seq_id,
 #                    llama_pos   p0,
 #                    llama_pos   p1,
@@ -1964,49 +1900,6 @@ def llama_kv_self_seq_add(
     p0 < 0 : [0, p1]
     p1 < 0 : [p0, inf)"""
     ...
-
-
-# // NOTE: Deprecated
-# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-# // If the KV cache is RoPEd, the KV data is updated accordingly:
-# //   - lazily on next llama_decode()
-# //   - explicitly with llama_kv_cache_update()
-# // p0 < 0 : [0, p1]
-# // p1 < 0 : [p0, inf)
-# LLAMA_API void llama_kv_cache_seq_add(
-#         struct llama_context * ctx,
-#                 llama_seq_id   seq_id,
-#                    llama_pos   p0,
-#                    llama_pos   p1,
-#                    llama_pos   delta);
-@ctypes_function(
-    "llama_kv_self_seq_add",
-    [
-        llama_context_p_ctypes,
-        llama_seq_id,
-        llama_pos,
-        llama_pos,
-        llama_pos,
-    ],
-    None,
-)
-def llama_kv_cache_seq_add(
-    ctx: llama_context_p,
-    seq_id: Union[llama_seq_id, int],
-    p0: Union[llama_pos, int],
-    p1: Union[llama_pos, int],
-    delta: Union[llama_pos, int],
-    /,
-):
-    """Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-    If the KV cache is RoPEd, the KV data is updated accordingly:
-    - lazily on next llama_decode()
-    - explicitly with llama_kv_cache_update()
-    p0 < 0 : [0, p1]
-    p1 < 0 : [p0, inf)"""
-    ...
-
-
 # // Integer division of the positions by factor of `d > 1`
 # // If the KV cache is RoPEd, the KV data is updated accordingly
 # // p0 < 0 : [0, p1]
 # // p1 < 0 : [p0, inf)
@@ -2043,43 +1936,6 @@ def llama_kv_self_seq_div(
     ...
-# // NOTE: Deprecated
-# // Integer division of the positions by factor of `d > 1`
-# // If the KV cache is RoPEd, the KV data is updated accordingly
-# // p0 < 0 : [0, p1]
-# // p1 < 0 : [p0, inf)
-# LLAMA_API void llama_kv_cache_seq_div(
-#         struct llama_context * ctx,
-#                 llama_seq_id   seq_id,
-#                    llama_pos   p0,
-#                    llama_pos   p1,
-#                          int   d);
-@ctypes_function(
-    "llama_kv_self_seq_div",
-    [
-        llama_context_p_ctypes,
-        llama_seq_id,
-        llama_pos,
-        llama_pos,
-        ctypes.c_int,
-    ],
-    None,
-)
-def llama_kv_cache_seq_div(
-    ctx: llama_context_p,
-    seq_id: Union[llama_seq_id, int],
-    p0: Union[llama_pos, int],
-    p1: Union[llama_pos, int],
-    d: Union[ctypes.c_int, int],
-    /,
-):
-    """Integer division of the positions by factor of `d > 1`
-    If the KV cache is RoPEd, the KV data is updated accordingly
-    p0 < 0 : [0, p1]
-    p1 < 0 : [p0, inf)"""
-    ...
-
-
 # // Returns the largest position present in the KV cache for the specified sequence
 # LLAMA_API llama_pos llama_kv_self_seq_pos_max(
 #         struct llama_context * ctx,
@@ -2108,21 +1964,6 @@ def llama_kv_self_defrag(ctx: llama_context_p, /):
     ...
 
 
-# NOTE: Deprecated
-# // Defragment the KV cache
-# // This will be applied:
-# //   - lazily on next llama_decode()
-# //   - explicitly with llama_kv_self_update()
-# LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
-@ctypes_function("llama_kv_cache_defrag", [llama_context_p_ctypes], None)
-def llama_kv_cache_defrag(ctx: llama_context_p, /):
-    """Defragment the KV cache
-    This will be applied:
-    - lazily on next llama_decode()
-    - explicitly with llama_kv_cache_update()"""
-    ...
-
-
 # // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
 # LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
 @ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None)
@@ -2147,15 +1988,6 @@ def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool:
     ...
 
 
-# // NOTE: Deprecated
-# // Check if the context supports KV cache shifting
-# LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
-@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
-def llama_kv_cache_can_shift(ctx: llama_context_p, /) -> bool:
-    """Check if the context supports KV cache shifting"""
-    ...
-
-
 # //
 # // State / sessions
 # //

From fc4d0f270acc5cbfb95e75ed91ae95df5e1673cc Mon Sep 17 00:00:00 2001
From: "serhii.n"
Date: Sat, 14 Jun 2025 15:45:40 +0300
Subject: [PATCH 3/3] Revert llama.cpp to
 8733e0cf6eefc7c7752297cc22d0836706f4222c

---
 vendor/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 26ff3685b..8733e0cf6 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 26ff3685bfbaa4c8838d7afd988b17dd5eb99f92
+Subproject commit 8733e0cf6eefc7c7752297cc22d0836706f4222c
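
Migration note: every deprecated llama_kv_cache_* binding removed by this series was a thin alias for a llama_kv_self_* symbol with an identical parameter list, so downstream callers only need a rename. Below is a minimal sketch of the new spelling; fork_sequences is a hypothetical helper, and it assumes `ctx` is a valid llama_context obtained elsewhere (model and context setup omitted):

    import llama_cpp

    def fork_sequences(ctx, n_parallel: int, n_tokens: int) -> None:
        # Previously llama_kv_cache_clear(ctx): erase cell info and zero KV data.
        llama_cpp.llama_kv_self_clear(ctx)
        # Previously llama_kv_cache_seq_cp: copy sequence 0 into sequences
        # 1..n_parallel-1 over positions [0, n_tokens) without allocating
        # extra KV cache memory.
        for i in range(1, n_parallel):
            llama_cpp.llama_kv_self_seq_cp(ctx, 0, i, 0, n_tokens)
        # Debug counters keep their llama_kv_self_* names.
        print(llama_cpp.llama_kv_self_n_tokens(ctx),
              llama_cpp.llama_kv_self_used_cells(ctx))

The removed llama_get_kv_cache_token_count and llama_get_kv_cache_used_cells have the same surviving equivalents, llama_kv_self_n_tokens and llama_kv_self_used_cells; the llama_kv_cache_view debugging API removed in PATCH 1/3 has no replacement.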