
Commit bb2bb37

Merge remote-tracking branch 'origin/master' into GraniteFour
* origin/master:
  llama : initial Mamba-2 support (ggml-org#9126)
  sync : ggml
  ggml : add version function to get lib version (ggml/1286)
  Set RPATH to "@loader_path" / "$ORIGIN" to ensure executables and dynamic libraries search for dependencies in their origin directory. (ggml-org#14309)
  CUDA: add softmax broadcast (ggml-org#14475)
  CUDA: broadcasting for FlashAttention mask (ggml-org#14500)
  vulkan: support softmax/FA batch and broadcast (ggml-org#14449)
  ggml : support bcast ggml_soft_max_ext, ggml_flash_attn_ext (ggml-org#14435)
  opencl : fix possible buffer overflow in dump_tensor (ggml-org#14490)
  simple-chat : fix context-exceeded condition (ggml-org#14494)
  opencl : skip empty nodes on cgraph compute (ggml-org#14491)
  opencl : update upscale to support align corners (ggml-org#14488)
  ci : add OpenCL to labeler workflow (ggml-org#14496)
  github : add OpenCL backend to issue templates (ggml-org#14492)
  ggml : Callback before abort (ggml-org#14481)
  ci : disable fast-math for Metal GHA CI (ggml-org#14478)
2 parents 28361c4 + 5d46bab commit bb2bb37

40 files changed: +572 −302 lines

.github/ISSUE_TEMPLATE/010-bug-compilation.yml

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ body:
     attributes:
       label: GGML backends
       description: Which GGML backends do you know to be affected?
-      options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
+      options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
       multiple: true
     validations:
       required: true

.github/ISSUE_TEMPLATE/011-bug-results.yml

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ body:
     attributes:
       label: GGML backends
       description: Which GGML backends do you know to be affected?
-      options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
+      options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
       multiple: true
     validations:
       required: true

.github/labeler.yml

Lines changed: 5 additions & 0 deletions
@@ -93,3 +93,8 @@ Ascend NPU:
           - ggml/include/ggml-cann.h
           - ggml/src/ggml-cann/**
           - docs/backend/CANN.md
+OpenCL:
+  - changed-files:
+      - any-glob-to-any-file:
+          - ggml/include/ggml-opencl.h
+          - ggml/src/ggml-opencl/**

.github/workflows/build.yml

Lines changed: 2 additions & 1 deletion
@@ -84,7 +84,8 @@ jobs:
             -DCMAKE_BUILD_RPATH="@loader_path" \
             -DLLAMA_FATAL_WARNINGS=ON \
             -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DGGML_METAL_EMBED_LIBRARY=OFF \
+            -DGGML_METAL_SHADER_DEBUG=ON \
             -DGGML_RPC=ON
         cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

.github/workflows/release.yml

Lines changed: 8 additions & 2 deletions
@@ -49,7 +49,8 @@ jobs:
       run: |
         sysctl -a
         cmake -B build \
-            -DCMAKE_BUILD_RPATH="@loader_path" \
+            -DCMAKE_INSTALL_RPATH='@loader_path' \
+            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
             -DLLAMA_FATAL_WARNINGS=ON \
             -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
@@ -103,7 +104,8 @@ jobs:
         # Metal is disabled due to intermittent failures with Github runners not having a GPU:
         # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
         cmake -B build \
-            -DCMAKE_BUILD_RPATH="@loader_path" \
+            -DCMAKE_INSTALL_RPATH='@loader_path' \
+            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
             -DLLAMA_FATAL_WARNINGS=ON \
             -DGGML_METAL=OFF \
             -DGGML_RPC=ON
@@ -160,6 +162,8 @@ jobs:
       id: cmake_build
       run: |
         cmake -B build \
+            -DCMAKE_INSTALL_RPATH='$ORIGIN' \
+            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
             -DGGML_BACKEND_DL=ON \
             -DGGML_NATIVE=OFF \
             -DGGML_CPU_ALL_VARIANTS=ON \
@@ -211,6 +215,8 @@ jobs:
       id: cmake_build
       run: |
         cmake -B build \
+            -DCMAKE_INSTALL_RPATH='$ORIGIN' \
+            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
             -DGGML_BACKEND_DL=ON \
             -DGGML_NATIVE=OFF \
             -DGGML_CPU_ALL_VARIANTS=ON \

convert_hf_to_gguf.py

Lines changed: 10 additions & 13 deletions
@@ -4877,7 +4877,7 @@ def __init__(self, dir_model: Path, *args, **kwargs):
         super().__init__(dir_model, *args, hparams=hparams, **kwargs)
         self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
         self.d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
-        self.n_group = self.hparams.get("n_groups", 1)
+        self.n_group = self.find_hparam(["n_groups"], optional=True) or 1

     def set_vocab(self):
         vocab_size = self.hparams["vocab_size"]
@@ -4900,30 +4900,27 @@ def set_vocab(self):
             self._set_vocab_builtin("gpt-neox", vocab_size)

     def set_gguf_parameters(self):
-        d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
-        d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
-        d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
-        d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
-        head_dim = self.find_hparam(["head_dim"], optional=True) or 64
-        n_group = self.find_hparam(["n_groups"], optional=True) or 1
+        d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
+        d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
+        head_dim = self.find_hparam(["head_dim"], optional=True) or 64

         rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5

         # Fail early for models which don't have a block expansion factor of 2
         # TODO: does this really matter?
-        assert d_inner == 2 * d_model
-        assert d_inner % head_dim == 0
+        assert self.d_inner == 2 * self.d_model
+        assert self.d_inner % head_dim == 0

         self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
-        self.gguf_writer.add_embedding_length(d_model)
+        self.gguf_writer.add_embedding_length(self.d_model)
         self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
         self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
         self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_ssm_conv_kernel(d_conv)
-        self.gguf_writer.add_ssm_inner_size(d_inner)
+        self.gguf_writer.add_ssm_inner_size(self.d_inner)
         self.gguf_writer.add_ssm_state_size(d_state)
-        self.gguf_writer.add_ssm_time_step_rank(d_inner // head_dim)
-        self.gguf_writer.add_ssm_group_count(n_group)
+        self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim)
+        self.gguf_writer.add_ssm_group_count(self.n_group)
         self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
         self.gguf_writer.add_file_type(self.ftype)

examples/simple-chat/simple-chat.cpp

Lines changed: 4 additions & 3 deletions
@@ -113,15 +113,16 @@ int main(int argc, char ** argv) {
     while (true) {
         // check if we have enough space in the context to evaluate this batch
         int n_ctx = llama_n_ctx(ctx);
-        int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0);
+        int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) + 1;
         if (n_ctx_used + batch.n_tokens > n_ctx) {
             printf("\033[0m\n");
             fprintf(stderr, "context size exceeded\n");
             exit(0);
         }

-        if (llama_decode(ctx, batch)) {
-            GGML_ABORT("failed to decode\n");
+        int ret = llama_decode(ctx, batch);
+        if (ret != 0) {
+            GGML_ABORT("failed to decode, ret = %d\n", ret);
         }

         // sample the next token

ggml/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
@@ -360,6 +360,13 @@ write_basic_package_version_file(
     VERSION ${GGML_INSTALL_VERSION}
     COMPATIBILITY SameMajorVersion)

+target_compile_definitions(ggml-base PRIVATE
+    GGML_VERSION="${GGML_INSTALL_VERSION}"
+    GGML_COMMIT="${GGML_BUILD_COMMIT}"
+)
+message(STATUS "ggml version: ${GGML_INSTALL_VERSION}")
+message(STATUS "ggml commit: ${GGML_BUILD_COMMIT}")
+
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
               ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
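
These `target_compile_definitions` entries feed the new `ggml_version()` / `ggml_commit()` API declared in ggml.h below. As a rough sketch (not copied from the actual ggml sources), the implementation side can simply return the injected macros; the `"unknown"` fallbacks are an assumption for builds where the definitions are absent:

```cpp
// Hypothetical sketch of the implementation; the real ggml source may differ.
#ifndef GGML_VERSION
#define GGML_VERSION "unknown"   // assumed fallback, not from the ggml build
#endif
#ifndef GGML_COMMIT
#define GGML_COMMIT  "unknown"   // assumed fallback, not from the ggml build
#endif

const char * ggml_version(void) {
    return GGML_VERSION;   // value of GGML_INSTALL_VERSION at configure time
}

const char * ggml_commit(void) {
    return GGML_COMMIT;    // short hash recorded in GGML_BUILD_COMMIT
}
```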

ggml/include/ggml.h

Lines changed: 27 additions & 6 deletions
@@ -314,6 +314,13 @@
 extern "C" {
 #endif

+    // Function type used in fatal error callbacks
+    typedef void (*ggml_abort_callback_t)(const char * error_message);
+
+    // Set the abort callback (passing null will restore original abort functionality: printing a message to stdout)
+    // Returns the old callback for chaining
+    GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback);
+
     GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
     GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);

@@ -639,6 +646,9 @@ extern "C" {

     // misc

+    GGML_API const char * ggml_version(void);
+    GGML_API const char * ggml_commit(void);
+
     GGML_API void    ggml_time_init(void); // call this once at the beginning of the program
     GGML_API int64_t ggml_time_ms(void);
     GGML_API int64_t ggml_time_us(void);
@@ -1503,8 +1513,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);

+    // a    [ne0, ne01, ne02, ne03]
+    // mask [ne0, ne11, ne12, ne13] | ne11 >= ne01, F16 or F32, optional
+    //
+    // broadcast:
+    //   ne02 % ne12 == 0
+    //   ne03 % ne13 == 0
+    //
     // fused soft_max(a*scale + mask*(ALiBi slope))
-    // mask is optional
     // max_bias = 0.0f for no ALiBi
     GGML_API struct ggml_tensor * ggml_soft_max_ext(
             struct ggml_context * ctx,
@@ -1967,11 +1983,16 @@ extern "C" {

 #define GGML_KQ_MASK_PAD 64

-    // q:    [n_embd_k, n_batch,     n_head,    1]
-    // k:    [n_embd_k, n_kv,        n_head_kv, 1]
-    // v:    [n_embd_v, n_kv,        n_head_kv, 1] !! not transposed !!
-    // mask: [n_kv,     n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
-    // res:  [n_embd_v, n_head,      n_batch,   1] !! permuted !!
+    // q:    [n_embd_k, n_batch,     n_head,    ne3]
+    // k:    [n_embd_k, n_kv,        n_head_kv, ne3]
+    // v:    [n_embd_v, n_kv,        n_head_kv, ne3] !! not transposed !!
+    // mask: [n_kv,     n_batch_pad, ne32,      1  ] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd_v, n_head,      n_batch,   ne3] !! permuted !!
+    //
+    // broadcast:
+    //   n_head % n_head_kv == 0
+    //   ne3 % ne32 == 0
+    //
     GGML_API struct ggml_tensor * ggml_flash_attn_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
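
Taken together, the new declarations let an application report which ggml build it is running and hook fatal errors before the library aborts. A small usage sketch based only on the declarations above; the logging format and file names are illustrative:

```cpp
#include <cstdio>

#include "ggml.h"

// Illustrative abort handler: invoked with the error message before ggml aborts.
static void on_ggml_abort(const char * error_message) {
    fprintf(stderr, "[fatal][ggml %s, commit %s] %s\n",
            ggml_version(), ggml_commit(), error_message);
    fflush(stderr);
}

int main() {
    printf("ggml %s (%s)\n", ggml_version(), ggml_commit());

    // Install the handler; the previous callback is returned and could be chained.
    ggml_abort_callback_t prev = ggml_set_abort_callback(on_ggml_abort);
    (void) prev;

    // ... build and compute graphs as usual; any GGML_ABORT() now reaches
    // on_ggml_abort() before the process terminates.
    return 0;
}
```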

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 6 additions & 1 deletion
@@ -2187,7 +2187,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_SQRT:
         case GGML_OP_CLAMP:
         case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
@@ -2205,6 +2204,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_PAD_REFLECT_1D:
         case GGML_OP_COUNT_EQUAL:
             return true;
+        case GGML_OP_SOFT_MAX:
+            // TODO: support broadcast
+            // ref: https://github.com/ggml-org/llama.cpp/pull/14435
+            return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
         case GGML_OP_FLASH_ATTN_EXT:{
             // derived from [ggml-cuda.cu]
             if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
@@ -2227,6 +2230,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                 // DeepSeek MLA
                 return false;
             }
+            // TODO: support broadcast
+            // ref: https://github.com/ggml-org/llama.cpp/pull/14435
            if (op->src[0]->ne[3] != 1) {
                return false;
            }
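
For context, the broadcast case this `supports_op` guard declines is the one newly documented for `ggml_soft_max_ext` above: a mask whose third and fourth dimensions divide those of the input instead of matching them. A rough sketch of such a call with illustrative shapes (not taken from any real model); it only builds the op, so running it would additionally need a graph and a backend or scheduler:

```cpp
#include "ggml.h"

int main() {
    // context that also holds tensor data; 16 MiB is ample for these shapes
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // a:    [256, 32, 8, 4]  e.g. scores for 8 heads over 4 batch entries
    // mask: [256, 32, 8, 1]  broadcast across the last dimension (4 % 1 == 0)
    struct ggml_tensor * a    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 32, 8, 4);
    struct ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 32, 8, 1);

    // fused soft_max(a*scale + mask); max_bias = 0.0f disables ALiBi
    struct ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, 1.0f/16.0f, 0.0f);
    (void) out;

    // The CANN supports_op hook above would report this op as unsupported
    // (mask->ne[2] != 1 here), so the scheduler would fall back to a backend
    // that implements the broadcast.

    ggml_free(ctx);
    return 0;
}
```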
