vulkan: fix noncontig check for mat_mul_id splitting (ggml-org#14683)

jeffbolznv · web-flow · commit ba1ceb34566c · 2025-07-15T21:51:09.000+02:00
* vulkan: fix noncontig check for mat_mul_id splitting

Remove supports_op check for &gt; 4096 (splitting fixes this)

* vulkan: fix batched matmul dequant for Q*_K
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -4922,7 +4922,7 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
     return
         tensor->nb[0] == ggml_type_size(tensor->type) &&
         tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+        (tensor->ne[3] == 1 || tensor->nb[3] == tensor->nb[2]*tensor->ne[2]);
 }
 
 static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) {
@@ -10356,10 +10356,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                         // If there's not enough shared memory for row_ids and the result tile, fallback to CPU
                         return false;
                     }
-                    // Check against size of shared memory variable
-                    if (op->src[2]->ne[0] > 4096) {
-                        return false;
-                    }
                 }
                 switch (src0_type) {
                     case GGML_TYPE_F32:
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp
@@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
 void main() {
     [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
         const uint i = gl_WorkGroupID.x * 256 + wgy;
-        if (i >= p.M * p.K / QUANT_K) {
+        if (i >= p.nel / QUANT_K) {
             return;
         }
 
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp
@@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
 void main() {
     [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
         const uint i = uint(gl_WorkGroupID.x * 256 + wgy);
-        if (i >= p.M * p.K / QUANT_K) {
+        if (i >= p.nel / QUANT_K) {
             return;
         }
 
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp
@@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
 void main() {
     [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
         const uint ib = gl_WorkGroupID.x * 256 + wgy;
-        if (ib >= p.M * p.K / QUANT_K) {
+        if (ib >= p.nel / QUANT_K) {
             return;
         }
 
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp
@@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
 void main() {
     [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
         const uint ib = gl_WorkGroupID.x * 256 + wgy;
-        if (ib >= p.M * p.K / QUANT_K) {
+        if (ib >= p.nel / QUANT_K) {
             return;
         }
 
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp
@@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
 void main() {
     [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
         const uint i = gl_WorkGroupID.x * 256 + wgy;
-        if (i >= p.M * p.K / QUANT_K) {
+        if (i >= p.nel / QUANT_K) {
             return;
         }
         const uint tid = gl_LocalInvocationID.x;

Original file line number	Diff line number	Diff line change
`@@ -4922,7 +4922,7 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {`
`4922`	`4922`	`return`
`4923`	`4923`	`tensor->nb[0] == ggml_type_size(tensor->type) &&`
`4924`	`4924`	`tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&`
`4925`		`- tensor->nb[3] == tensor->nb[2]*tensor->ne[2];`
	`4925`	`+ (tensor->ne[3] == 1 \|\| tensor->nb[3] == tensor->nb[2]*tensor->ne[2]);`
`4926`	`4926`	`}`
`4927`	`4927`
`4928`	`4928`	`static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) {`
`@@ -10356,10 +10356,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm`
`10356`	`10356`	`// If there's not enough shared memory for row_ids and the result tile, fallback to CPU`
`10357`	`10357`	`return false;`
`10358`	`10358`	`}`
`10359`		`- // Check against size of shared memory variable`
`10360`		`- if (op->src[2]->ne[0] > 4096) {`
`10361`		`- return false;`
`10362`		`- }`
`10363`	`10359`	`}`
`10364`	`10360`	`switch (src0_type) {`
`10365`	`10361`	`case GGML_TYPE_F32:`
Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};`
`10`	`10`	`void main() {`
`11`	`11`	`[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {`
`12`	`12`	`const uint i = gl_WorkGroupID.x * 256 + wgy;`
`13`		`- if (i >= p.M * p.K / QUANT_K) {`
	`13`	`+ if (i >= p.nel / QUANT_K) {`
`14`	`14`	`return;`
`15`	`15`	`}`
`16`	`16`