@@ -1760,11 +1760,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     // printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     // printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (!split && src0->type == GGML_TYPE_F16 && src1->ne[1] == 1 && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+    if (!split && use_mul_mat_vec && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+        // the custom F16 vector kernel can be used over batched cuBLAS GEMM
+        // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
         ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
     } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
         && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
-        // KQ + KQV multi-batch without FlashAttention
+        // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_mul_mat_vec) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
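The net effect of the hunk is that the F16-type and src1->ne[1] == 1 checks are now folded into the precomputed use_mul_mat_vec flag, so the first branch keys only on that flag plus the thin-matrix / tensor-core heuristic described in the new comments. A minimal standalone sketch of that dispatch order follows; names such as pick_mul_mat_path, rows_src0, has_fp16_mma and the MMV_MAX_ROWS value are illustrative stand-ins, not the ggml-cuda API:

#include <cstdint>

// Illustrative only: mirrors the branch order in the hunk above, not the actual ggml-cuda code.
constexpr int64_t MMV_MAX_ROWS = 512;  // assumed threshold; see the real constant in the CUDA backend

enum class MulMatPath { MulMatVec, BatchedCublas, GenericSplit };

MulMatPath pick_mul_mat_path(bool split, bool use_mul_mat_vec, bool has_fp16_mma,
                             int64_t rows_src0, int64_t dst_ne3, int64_t src1_batches) {
    if (!split && use_mul_mat_vec && dst_ne3 == 1 &&
        (rows_src0 < MMV_MAX_ROWS || !has_fp16_mma)) {
        // thin src0 (e.g. KQV in attention) or no tensor cores: the custom F16 vector kernel wins
        return MulMatPath::MulMatVec;
    }
    if (!split && src1_batches > 1) {
        // general KQ + KQV multi-batch without FlashAttention
        return MulMatPath::BatchedCublas;
    }
    // fall back to the row-split ggml_cuda_op_mul_mat path
    return MulMatPath::GenericSplit;
}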