
Commit 89ee2f1

backends : unsupport batched FA in CUDA and Vulkan
ggml-ci
1 parent 6036177 · commit 89ee2f1

File tree

ggml/src/ggml-cuda/ggml-cuda.cu
ggml/src/ggml-vulkan/ggml-vulkan.cpp

2 files changed: +8 −1 lines changed

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 2 additions & 1 deletion
@@ -3377,7 +3377,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 return false;
             }
             // TODO: support broadcast
-            // ref: https://github.com/ggml-org/llama.cpp/pull/14435
+            // note: this was initially implemented in https://github.com/ggml-org/llama.cpp/pull/14500, but
+            // the interface of ggml_flash_attn_ext() changed in https://github.com/ggml-org/llama.cpp/pull/14505
             if (op->src[0]->ne[3] != 1) {
                 return false;
             }

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 6 additions & 0 deletions
@@ -10265,6 +10265,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
             if (op->src[3] && op->src[3]->type != GGML_TYPE_F16) {
                 return false;
             }
+            // TODO: support broadcast
+            // note: this was initially implemented in https://github.com/ggml-org/llama.cpp/pull/14449, but
+            // the interface of ggml_flash_attn_ext() changed in https://github.com/ggml-org/llama.cpp/pull/14505
+            if (op->src[0]->ne[3] != 1) {
+                return false;
+            }
             // It's straightforward to support different K/V dequant, but would
             // significantly increase the number of pipelines
             if (op->src[1]->type != op->src[2]->type) {
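Both hunks install the same guard in the backend's supports_op callback. Below is a minimal standalone sketch (not the upstream code; the helper name example_supports_flash_attn_ext is hypothetical) of what that guard does, assuming the usual ggml source layout for GGML_OP_FLASH_ATTN_EXT (src[0] = Q, src[1] = K, src[2] = V, src[3] = mask): when the query tensor has a non-trivial 4th (broadcast/batch) dimension, the backend reports the op as unsupported so the scheduler can fall back to another backend.

```cpp
#include "ggml.h"

// Sketch of the broadcast guard shared by the CUDA and Vulkan supports_op
// callbacks in this commit. Assumes the standard ggml source layout for
// GGML_OP_FLASH_ATTN_EXT: src[0] = Q, src[1] = K, src[2] = V, src[3] = mask.
static bool example_supports_flash_attn_ext(const ggml_tensor * op) {
    if (op->op != GGML_OP_FLASH_ATTN_EXT) {
        return true; // the guard only concerns flash attention ops
    }
    // ne[3] > 1 means the graph was built with a broadcast over the 4th
    // (batch) dimension of Q; the CUDA/Vulkan FA kernels do not handle that
    // case yet, so the op is rejected and ggml's scheduler falls back to a
    // backend that does support it (e.g. the CPU backend).
    if (op->src[0]->ne[3] != 1) {
        return false;
    }
    return true;
}
```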
