
Commit 89ee2f1

backends : unsupport batched FA in CUDA and Vulkan
ggml-ci
1 parent 6036177 · commit 89ee2f1

File tree

ggml/src/ggml-cuda/ggml-cuda.cu
ggml/src/ggml-vulkan/ggml-vulkan.cpp

2 files changed: +8 −1 lines changed

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 2 additions & 1 deletion
@@ -3377,7 +3377,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 return false;
             }
             // TODO: support broadcast
-            // ref: https://github.com/ggml-org/llama.cpp/pull/14435
+            // note: this was initially implemented in https://github.com/ggml-org/llama.cpp/pull/14500, but
+            // the interface of ggml_flash_attn_ext() changed in https://github.com/ggml-org/llama.cpp/pull/14505
             if (op->src[0]->ne[3] != 1) {
                 return false;
             }

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 6 additions & 0 deletions
@@ -10265,6 +10265,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
             if (op->src[3] && op->src[3]->type != GGML_TYPE_F16) {
                 return false;
             }
+            // TODO: support broadcast
+            // note: this was initially implemented in https://github.com/ggml-org/llama.cpp/pull/14449, but
+            // the interface of ggml_flash_attn_ext() changed in https://github.com/ggml-org/llama.cpp/pull/14505
+            if (op->src[0]->ne[3] != 1) {
+                return false;
+            }
             // It's straightforward to support different K/V dequant, but would
             // significantly increase the number of pipelines
             if (op->src[1]->type != op->src[2]->type) {
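Both hunks install the same guard in the backend's supports_op callback. Below is a minimal standalone sketch (not the upstream code; the helper name example_supports_flash_attn_ext is hypothetical) of what that guard does, assuming the usual ggml source layout for GGML_OP_FLASH_ATTN_EXT (src[0] = Q, src[1] = K, src[2] = V, src[3] = mask): when the query tensor has a non-trivial 4th (broadcast/batch) dimension, the backend reports the op as unsupported so the scheduler can fall back to another backend.

```cpp
#include "ggml.h"

// Sketch of the broadcast guard shared by the CUDA and Vulkan supports_op
// callbacks in this commit. Assumes the standard ggml source layout for
// GGML_OP_FLASH_ATTN_EXT: src[0] = Q, src[1] = K, src[2] = V, src[3] = mask.
static bool example_supports_flash_attn_ext(const ggml_tensor * op) {
    if (op->op != GGML_OP_FLASH_ATTN_EXT) {
        return true; // the guard only concerns flash attention ops
    }
    // ne[3] > 1 means the graph was built with a broadcast over the 4th
    // (batch) dimension of Q; the CUDA/Vulkan FA kernels do not handle that
    // case yet, so the op is rejected and ggml's scheduler falls back to a
    // backend that does support it (e.g. the CPU backend).
    if (op->src[0]->ne[3] != 1) {
        return false;
    }
    return true;
}
```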
