
Commit f8fd440
Merge branch 'master' into xsn/qwen3_embd_rerank
2 parents 3f3b9a2 + 1caae7f

15 files changed: +374 / -225 lines

ci/run.sh

Lines changed: 14 additions & 1 deletion
@@ -46,7 +46,20 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
 fi
 
 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON"
+
+    if command -v nvidia-smi >/dev/null 2>&1; then
+        CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
+        if [[ -n "$CUDA_ARCH" && "$CUDA_ARCH" =~ ^[0-9]+$ ]]; then
+            CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH}"
+        else
+            echo "Warning: Using fallback CUDA architectures"
+            CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=61;70;75;80;86;89"
+        fi
+    else
+        echo "Error: nvidia-smi not found, cannot build with CUDA"
+        exit 1
+    fi
 fi
 
 if [ ! -z ${GG_BUILD_SYCL} ]; then

convert_hf_to_gguf.py

Lines changed: 1 addition & 2 deletions
@@ -3767,8 +3767,7 @@ def set_gguf_parameters(self):
         self._try_set_pooling_type()
 
         if self.cls_out_labels:
-            key_name = gguf.Keys.Classifier.OUTPUT_LABELS.format(arch = gguf.MODEL_ARCH_NAMES[self.model_arch])
-            self.gguf_writer.add_array(key_name, [v for k, v in sorted(self.cls_out_labels.items())])
+            self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())])
 
     def set_vocab(self):
         tokens, toktypes, tokpre = self.get_vocab_base()

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 32 additions & 2 deletions
@@ -196,6 +196,7 @@ enum vk_device_architecture {
     AMD_RDNA1,
     AMD_RDNA2,
     AMD_RDNA3,
+    INTEL_XE2,
 };
 
 static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) {
@@ -246,6 +247,34 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
             }
             return vk_device_architecture::AMD_RDNA2;
         }
+    } else if (props.vendorID == VK_VENDOR_ID_INTEL) {
+        const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
+
+        bool subgroup_size_control = false;
+
+        for (const auto& properties : ext_props) {
+            if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
+                subgroup_size_control = true;
+            }
+        }
+
+        if (!subgroup_size_control) {
+            return vk_device_architecture::OTHER;
+        }
+
+        vk::PhysicalDeviceProperties2 props2;
+        vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
+
+        props2.pNext = &subgroup_size_control_props;
+        device.getProperties2(&props2);
+
+        if (subgroup_size_control_props.minSubgroupSize == 16) {
+            // Xe2 architecture uses SIMD16 while previous Xe and Gen architecture uses SIMD8.
+            // Minimum subgroup size matches the SIMD width so we distinguish architecture by checking this value.
+            // https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html
+            // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
+            return vk_device_architecture::INTEL_XE2;
+        }
     }
     return vk_device_architecture::OTHER;
 }
@@ -10263,8 +10292,9 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve
 static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
     switch (props.vendorID) {
     case VK_VENDOR_ID_INTEL:
-        // Intel drivers don't support coopmat properly yet
-        return false;
+        // Only allowing Xe2 GPU at the moment since Xe2 GPU can gain significant performance boost,
+        // while some older hardware (ex. Arc A770) has performance regressions
+        return arch == vk_device_architecture::INTEL_XE2;
     case VK_VENDOR_ID_AMD:
         if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
             // Workaround for AMD proprietary driver reporting support on all GPUs
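For context, a minimal standalone sketch (not part of this commit) of the probe the patch relies on: it queries VkPhysicalDeviceSubgroupSizeControlPropertiesEXT and prints the minimum and maximum subgroup sizes, which is how Xe2 (SIMD16, minSubgroupSize == 16) is told apart from older Xe/Gen parts (SIMD8). It assumes the Vulkan-Hpp headers and a Vulkan 1.1+ driver exposing VK_EXT_subgroup_size_control.

#include <cstdio>

#include <vulkan/vulkan.hpp>

int main() {
    vk::ApplicationInfo app_info("subgroup-probe", 1, nullptr, 0, VK_API_VERSION_1_2);
    vk::InstanceCreateInfo inst_info({}, &app_info);
    vk::Instance instance = vk::createInstance(inst_info);

    for (const vk::PhysicalDevice & dev : instance.enumeratePhysicalDevices()) {
        vk::PhysicalDeviceProperties2 props2;
        vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_props;
        props2.pNext = &subgroup_props;
        dev.getProperties2(&props2);

        // with this commit, an Intel device reporting minSubgroupSize == 16 is classified
        // as INTEL_XE2 and therefore becomes eligible for KHR cooperative matrix support
        std::printf("%s: minSubgroupSize=%u maxSubgroupSize=%u\n",
                props2.properties.deviceName.data(),
                subgroup_props.minSubgroupSize,
                subgroup_props.maxSubgroupSize);
    }

    instance.destroy();
    return 0;
}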

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
@@ -935,6 +935,9 @@ def add_eot_token_id(self, id: int) -> None:
     def add_eom_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.EOM_ID, id)
 
+    def add_classifier_output_labels(self, labels: Sequence[str]) -> None:
+        self.add_array(Keys.Classifier.OUTPUT_LABELS.format(arch=self.arch), labels)
+
     # for vision models
 
     def add_clip_has_vision_encoder(self, value: bool) -> None:
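The new writer helper stores the labels as a string array under the arch-scoped key (Keys.Classifier.OUTPUT_LABELS formatted with the model architecture). As a rough reader-side sketch, not part of this commit, the array can be read back with ggml's gguf C API; the literal key "bert.classifier.output_labels" below is an assumed example for a BERT-style classifier, and the gguf.h header and signatures are those of recent ggml versions.

#include <cstdio>

#include "gguf.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        std::fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
        return 1;
    }

    // only parse the metadata, do not allocate tensor data
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file(argv[1], params);
    if (!ctx) {
        std::fprintf(stderr, "failed to open %s\n", argv[1]);
        return 1;
    }

    // assumed key for a BERT-arch classifier: "{arch}.classifier.output_labels"
    const int64_t kid = gguf_find_key(ctx, "bert.classifier.output_labels");
    if (kid >= 0) {
        const size_t n = gguf_get_arr_n(ctx, kid);
        for (size_t i = 0; i < n; ++i) {
            std::printf("label %zu: %s\n", i, gguf_get_arr_str(ctx, kid, i));
        }
    } else {
        std::printf("no classifier output labels in this model\n");
    }

    gguf_free(ctx);
    return 0;
}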

include/llama.h

Lines changed: 88 additions & 12 deletions
@@ -61,7 +61,10 @@ extern "C" {
     struct llama_model;
     struct llama_context;
     struct llama_sampler;
-    struct llama_kv_cache;
+
+    typedef struct llama_memory_i * llama_memory_t;
+
+    struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
 
     typedef int32_t llama_pos;
     typedef int32_t llama_token;
@@ -493,9 +496,11 @@ extern "C" {
     DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
 
     LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
-    LLAMA_API    struct llama_kv_cache * llama_get_kv_self (      struct llama_context * ctx);
+    LLAMA_API           llama_memory_t   llama_get_memory  (const struct llama_context * ctx);
     LLAMA_API  enum llama_pooling_type   llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
 
+    DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
+
     LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
     LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);
 
@@ -609,7 +614,78 @@ extern "C" {
             int32_t   il_end);
 
     //
-    // KV cache
+    // Memory
+    //
+
+    // Clear the memory contents
+    LLAMA_API void llama_memory_clear(llama_memory_t mem);
+
+    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
+    //   seq_id < 0 : match any sequence
+    //   p0 < 0     : [0,  p1]
+    //   p1 < 0     : [p0, inf)
+    LLAMA_API bool llama_memory_seq_rm(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+                 llama_pos p0,
+                 llama_pos p1);
+
+    // Copy all tokens that belong to the specified sequence to another sequence
+    //   p0 < 0 : [0,  p1]
+    //   p1 < 0 : [p0, inf)
+    LLAMA_API void llama_memory_seq_cp(
+            llama_memory_t mem,
+              llama_seq_id seq_id_src,
+              llama_seq_id seq_id_dst,
+                 llama_pos p0,
+                 llama_pos p1);
+
+    // Removes all tokens that do not belong to the specified sequence
+    LLAMA_API void llama_memory_seq_keep(
+            llama_memory_t mem,
+              llama_seq_id seq_id);
+
+    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+    //   p0 < 0 : [0,  p1]
+    //   p1 < 0 : [p0, inf)
+    LLAMA_API void llama_memory_seq_add(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+                 llama_pos p0,
+                 llama_pos p1,
+                 llama_pos delta);
+
+    // Integer division of the positions by factor of `d > 1`
+    //   p0 < 0 : [0,  p1]
+    //   p1 < 0 : [p0, inf)
+    LLAMA_API void llama_memory_seq_div(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+                 llama_pos p0,
+                 llama_pos p1,
+                       int d);
+
+    // Returns the smallest position present in the memory for the specified sequence
+    // This is typically non-zero only for SWA caches
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
+    // Return -1 if the sequence is empty
+    LLAMA_API llama_pos llama_memory_seq_pos_min(
+            llama_memory_t mem,
+              llama_seq_id seq_id);
+
+    // Returns the largest position present in the memory for the specified sequence
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
+    // Return -1 if the sequence is empty
+    LLAMA_API llama_pos llama_memory_seq_pos_max(
+            llama_memory_t mem,
+              llama_seq_id seq_id);
+
+    // Check if the memory supports shifting
+    LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
+
+    //
+    // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
     //
 
     // Returns the number of tokens in the KV cache (slow, use only for debug)
@@ -623,7 +699,7 @@ extern "C" {
 
     // Clear the KV cache - both cell info is erased and KV data is zeroed
     LLAMA_API void llama_kv_self_clear(
-                     struct llama_context * ctx);
+            struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
     // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
@@ -694,22 +770,22 @@ extern "C" {
     // Defragment the KV cache
     // This will be applied:
     //   - lazily on next llama_decode()
-    LLAMA_API DEPRECATED(void llama_kv_self_defrag(struct llama_context * ctx),
+    DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
         "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
 
     // Check if the context supports KV cache shifting
     LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
 
     // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API DEPRECATED(void llama_kv_self_update(struct llama_context * ctx),
+    DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
         "simply remove this call, updates are applied lazily on the next llama_decode()");
 
     //
     // State / sessions
     //
 
     // Returns the *actual* size in bytes of the state
-    // (logits, embedding and kv_cache)
+    // (logits, embedding and memory)
     // Only use when saving the state, not when restoring it, otherwise the size may be too small.
     LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
     LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
@@ -765,12 +841,12 @@ extern "C" {
                            size_t   n_token_count),
         "use llama_state_save_file instead");
 
-    // Get the exact size needed to copy the KV cache of a single sequence
+    // Get the exact size needed to copy the state of a single sequence
     LLAMA_API size_t llama_state_seq_get_size(
             struct llama_context * ctx,
                     llama_seq_id   seq_id);
 
-    // Copy the KV cache of a single sequence into the specified buffer
+    // Copy the state of a single sequence into the specified buffer
     LLAMA_API size_t llama_state_seq_get_data(
             struct llama_context * ctx,
                          uint8_t * dst,
@@ -836,16 +912,16 @@ extern "C" {
     // For encode-decoder contexts, processes the batch using the encoder.
     // Can store the encoder output internally for later use by the decoder's cross-attention layers.
     //   0 - success
-    // < 0 - error. the KV cache state is restored to the state before this call
+    // < 0 - error. the memory state is restored to the state before this call
     LLAMA_API int32_t llama_encode(
             struct llama_context * ctx,
               struct llama_batch   batch);
 
     // Process a batch of tokens.
-    // Requires KV cache.
+    // Requires the context to have a memory.
     // For encode-decoder contexts, processes the batch using the decoder.
     // Positive return values does not mean a fatal error, but rather a warning.
-    // Upon non-zero return values, the KV cache state is restored to the state before this call
+    // Upon non-zero return values, the memory state is restored to the state before this call
     //   0 - success
     //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
     //   2 - aborted

src/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
@@ -20,7 +20,6 @@ add_library(llama
             llama-hparams.cpp
             llama-impl.cpp
             llama-io.cpp
-            llama-kv-cache.cpp
            llama-kv-cache-unified.cpp
            llama-kv-cache-unified-iswa.cpp
            llama-kv-cache-recurrent.cpp
