@@ -3753,22 +3753,24 @@ template <bool need_check> static __global__ void
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols, const int nrows, const int row_delta) {
+    const int ncols, const int nrows, const int row_delta, const int channel_delta_x, const int channel_delta_y) {

     const int row = blockIdx.y*blockDim.y + threadIdx.y;

     if (row >= nrows) {
         return;
     }

+    const int channel = blockIdx.z*blockDim.z + threadIdx.z;
+
     const int blocks_per_row = ncols / qk;
     const int blocks_per_warp = vdr * WARP_SIZE / qi;

     // partial sum for each thread
     float tmp = 0.0f;

-    const block_q_t  * x = (const block_q_t  *) vx;
-    const block_q8_1 * y = (const block_q8_1 *) vy;
+    const block_q_t  * x = ((const block_q_t  *) vx) + channel*channel_delta_x;
+    const block_q8_1 * y = ((const block_q8_1 *) vy) + channel*channel_delta_y;

     for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
         const int ibx = row*row_delta + i + threadIdx.x / (qi/vdr); // x block index
@@ -3787,7 +3789,7 @@ static __global__ void mul_mat_vec_q(
     }

     if (threadIdx.x == 0) {
-        dst[row] = tmp;
+        dst[channel*nrows + row] = tmp;
     }
 }

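Aside, not part of the commit: a CPU sketch of the index math the kernel now performs, with plain floats standing in for the quantized blocks (in the kernel the deltas count blocks of block_q_t/block_q8_1, here they count elements; the function name is illustrative). Each channel offsets the x and y base pointers and writes its rows contiguously into dst:

#include <cstdio>

// Reference walk of the batched mat-vec indexing used by mul_mat_vec_q:
// x/y advance by a per-channel delta, dst is laid out channel-major.
static void mul_mat_vec_batched_ref(
    const float * x, const float * y, float * dst,
    int ncols, int nrows, int nchannels,
    int row_delta, int channel_delta_x, int channel_delta_y) {
    for (int channel = 0; channel < nchannels; ++channel) {
        const float * xc = x + channel*channel_delta_x; // per-channel base of src0
        const float * yc = y + channel*channel_delta_y; // per-channel base of src1
        for (int row = 0; row < nrows; ++row) {
            float tmp = 0.0f;
            for (int i = 0; i < ncols; ++i) {
                tmp += xc[row*row_delta + i]*yc[i];
            }
            dst[channel*nrows + row] = tmp; // same layout the kernel writes
        }
    }
}

int main() {
    const float x[2*2*3] = {1,2,3, 4,5,6,   7,8,9, 10,11,12}; // 2 channels, 2 rows, 3 cols
    const float y[2*3]   = {1,1,1,          2,2,2};           // 2 channels, 3 cols
    float dst[2*2];
    mul_mat_vec_batched_ref(x, y, dst, 3, 2, 2, /*row_delta=*/3, /*channel_delta_x=*/6, /*channel_delta_y=*/3);
    printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]); // -> 6 15 48 66
}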
@@ -4439,94 +4441,124 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

-static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int row_delta, cudaStream_t stream) {
+static void mul_mat_vec_q4_0_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels,
+    const int row_delta, const int channel_delta, const int channel_delta_y, cudaStream_t stream) {
+
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(1, block_num_y, nchannels);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta);
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta, channel_delta, channel_delta_y);
 }

-static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int row_delta, cudaStream_t stream) {
+static void mul_mat_vec_q4_1_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels,
+    const int row_delta, const int channel_delta, const int channel_delta_y, cudaStream_t stream) {
+
     GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(1, block_num_y, nchannels);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta);
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta, channel_delta, channel_delta_y);
 }

-static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int row_delta, cudaStream_t stream) {
+static void mul_mat_vec_q5_0_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels,
+    const int row_delta, const int channel_delta, const int channel_delta_y, cudaStream_t stream) {
+
     GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(1, block_num_y, nchannels);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta);
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta, channel_delta, channel_delta_y);
 }

-static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int row_delta, cudaStream_t stream) {
+static void mul_mat_vec_q5_1_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels,
+    const int row_delta, const int channel_delta, const int channel_delta_y, cudaStream_t stream) {
+
     GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(1, block_num_y, nchannels);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta);
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta, channel_delta, channel_delta_y);
 }

-static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int row_delta, cudaStream_t stream) {
+static void mul_mat_vec_q8_0_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels,
+    const int row_delta, const int channel_delta, const int channel_delta_y, cudaStream_t stream) {
+
     GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(1, block_num_y, nchannels);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta);
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta, channel_delta, channel_delta_y);
 }

-static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int row_delta, cudaStream_t stream) {
+static void mul_mat_vec_q2_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels,
+    const int row_delta, const int channel_delta, const int channel_delta_y, cudaStream_t stream) {
+
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(1, block_num_y, nchannels);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta);
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta, channel_delta, channel_delta_y);
 }

-static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int row_delta, cudaStream_t stream) {
+static void mul_mat_vec_q3_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels,
+    const int row_delta, const int channel_delta, const int channel_delta_y, cudaStream_t stream) {
+
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(1, block_num_y, nchannels);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta);
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta, channel_delta, channel_delta_y);
 }

-static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int row_delta, cudaStream_t stream) {
+static void mul_mat_vec_q4_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels,
+    const int row_delta, const int channel_delta, const int channel_delta_y, cudaStream_t stream) {
+
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(1, block_num_y, nchannels);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta);
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta, channel_delta, channel_delta_y);
 }

-static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int row_delta, cudaStream_t stream) {
+static void mul_mat_vec_q5_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels,
+    const int row_delta, const int channel_delta, const int channel_delta_y, cudaStream_t stream) {
+
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(1, block_num_y, nchannels);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta);
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta, channel_delta, channel_delta_y);
 }

-static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int row_delta, cudaStream_t stream) {
+static void mul_mat_vec_q6_K_q8_1_cuda(
+    const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels,
+    const int row_delta, const int channel_delta, const int channel_delta_y, cudaStream_t stream) {
+
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(1, block_num_y, nchannels);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta);
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows, row_delta, channel_delta, channel_delta_y);
 }

 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
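All ten launchers above change in lockstep: the grid gains a z dimension of size nchannels, so each (y, z) block coordinate covers GGML_CUDA_MMV_Y rows of one channel. A host-side sketch of the grid sizing, with dim3_ and mmvq_block_nums as illustrative stand-ins, not commit code (GGML_CUDA_MMV_Y == 2 in the example is an assumed tuning value):

#include <cstdio>

struct dim3_ { unsigned x, y, z; }; // host stand-in for CUDA's dim3

// Mirrors block_nums in the launchers: one warp per row along x,
// ceil(nrows / mmv_y) blocks along y, one grid layer per channel along z.
static dim3_ mmvq_block_nums(int nrows, int nchannels, int mmv_y) {
    const int block_num_y = (nrows + mmv_y - 1) / mmv_y;
    return dim3_{1u, (unsigned) block_num_y, (unsigned) nchannels};
}

int main() {
    const dim3_ g = mmvq_block_nums(4096, 32, 2); // e.g. GGML_CUDA_MMV_Y == 2
    printf("grid = (%u, %u, %u)\n", g.x, g.y, g.z); // -> grid = (1, 2048, 32)
}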
@@ -5559,7 +5591,13 @@ inline void ggml_cuda_op_mul_mat_vec(
     GGML_ASSERT(dst_ddf_i != nullptr);

     const int64_t ne00 = src0->ne[0];
+    const int64_t ne02 = src0->ne[2];
+
+    const int64_t ne10 = src1->ne[0];
+
     const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2];
+
     const int64_t nrows = i01_high - i01_low;

 #ifdef GGML_CUDA_FORCE_DMMV
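For orientation (not commit code): ggml's nb[] strides are in bytes, while the kernel steps in whole quantized blocks, which is why nb01 and nb02 are divided by ggml_type_size(src0->type) in the next hunk. The same conversion, shown with floats and made-up sizes:

#include <cassert>
#include <cstddef>

int main() {
    // a contiguous 4 x 3 x 2 float tensor: ne = {4, 3, 2}
    const size_t type_size = sizeof(float);
    const size_t nb01 = 4*type_size;   // byte stride from one row to the next
    const size_t nb02 = 4*3*type_size; // byte stride from one channel to the next

    const int row_delta     = (int)(nb01 / type_size); // 4 elements per row step
    const int channel_delta = (int)(nb02 / type_size); // 12 elements per channel step
    assert(row_delta == 4 && channel_delta == 12);
}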
@@ -5585,46 +5623,48 @@ inline void ggml_cuda_op_mul_mat_vec(
 #endif // QK_K == 256

     const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
-#endif
+#endif // GGML_CUDA_FORCE_DMMV

     if (use_mul_mat_vec_q) {
-        const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
-            ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
+        const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
+            ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
         size_t as;
-        void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
-        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
+        void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne02*sizeof(block_q8_1)/QK8_1, &as);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne02, padded_row_size, cudaStream_main);

-        const int row_delta = nb01 / ggml_type_size(src0->type);
+        const int row_delta       = nb01 / ggml_type_size(src0->type);
+        const int channel_delta   = nb02 / ggml_type_size(src0->type);
+        const int channel_delta_y = padded_row_size / QK8_1;
         switch (src0->type) {
             case GGML_TYPE_Q4_0:
-                mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, row_delta, cudaStream_main);
+                mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_delta, channel_delta, channel_delta_y, cudaStream_main);
                 break;
             case GGML_TYPE_Q4_1:
-                mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, row_delta, cudaStream_main);
+                mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_delta, channel_delta, channel_delta_y, cudaStream_main);
                 break;
             case GGML_TYPE_Q5_0:
-                mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, row_delta, cudaStream_main);
+                mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_delta, channel_delta, channel_delta_y, cudaStream_main);
                 break;
             case GGML_TYPE_Q5_1:
-                mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, row_delta, cudaStream_main);
+                mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_delta, channel_delta, channel_delta_y, cudaStream_main);
                 break;
             case GGML_TYPE_Q8_0:
-                mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, row_delta, cudaStream_main);
+                mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_delta, channel_delta, channel_delta_y, cudaStream_main);
                 break;
             case GGML_TYPE_Q2_K:
-                mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, row_delta, cudaStream_main);
+                mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_delta, channel_delta, channel_delta_y, cudaStream_main);
                 break;
             case GGML_TYPE_Q3_K:
-                mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, row_delta, cudaStream_main);
+                mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_delta, channel_delta, channel_delta_y, cudaStream_main);
                 break;
             case GGML_TYPE_Q4_K:
-                mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, row_delta, cudaStream_main);
+                mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_delta, channel_delta, channel_delta_y, cudaStream_main);
                 break;
             case GGML_TYPE_Q5_K:
-                mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, row_delta, cudaStream_main);
+                mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_delta, channel_delta, channel_delta_y, cudaStream_main);
                 break;
             case GGML_TYPE_Q6_K:
-                mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, row_delta, cudaStream_main);
+                mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_delta, channel_delta, channel_delta_y, cudaStream_main);
                 break;
             default:
                 GGML_ASSERT(false);
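Not from the commit: src1 is now quantized once for all ne02 channels, so the scratch buffer holds one padded q8_1 row per channel, and channel_delta_y is that padded row length measured in q8_1 blocks. A sketch of the sizing arithmetic; the 512 padding and 36-byte block size are assumed example values, and only QK8_1 == 32 is relied on as a ggml constant:

#include <cassert>
#include <cstddef>

// Mirrors the pool_malloc size above: ne02 channels of one padded row each,
// one q8_1 block per QK8_1 (= 32) values.
static size_t q8_1_scratch_bytes(long long ne10, long long ne02,
                                 long long padding, size_t sizeof_block_q8_1) {
    const long long padded_row_size =
        ne10 % padding == 0 ? ne10 : ne10 - ne10 % padding + padding;
    return (size_t)(padded_row_size*ne02)*sizeof_block_q8_1/32;
}

int main() {
    // 4097 columns round up to 4608; 2 channels; assumed 36-byte q8_1 blocks
    assert(q8_1_scratch_bytes(4097, 2, 512, 36) == (4608/32)*2*36);
}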
@@ -5633,6 +5673,8 @@ inline void ggml_cuda_op_mul_mat_vec(

         ggml_cuda_pool_free(src1_q8_1, as);
     } else {
+        GGML_ASSERT(ne02 == 1);
+
         // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
 #ifdef GGML_CUDA_F16
         size_t ash;
@@ -6320,7 +6362,6 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);

     const int64_t ne00 = src0->ne[0];
@@ -6336,18 +6377,24 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];

     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device];
+    char * src0_ddq = (char *) src0_extra->data_device[g_main_device];

     struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

     struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

-    const int row_stride_x = nb01 / sizeof(half);
-    const int channel_stride_x = nb02 / sizeof(half);
+    if (src0->type == GGML_TYPE_F16) {
+        const int row_stride_x = nb01 / sizeof(half);
+        const int channel_stride_x = nb02 / sizeof(half);

-    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
+        ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
+    } else if (ggml_is_quantized(src0->type)) {
+        ggml_cuda_op_mul_mat_vec(src0, src1, dst, src0_ddq, nullptr, src1_ddf, dst_ddf, 0, 0, ne01, 0, cudaStream_main);
+    } else {
+        GGML_ASSERT(false);
+    }
 }

 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
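The char * cast above is load-bearing, not cosmetic: pointer arithmetic on void * is not valid C++, and the new quantized branch hands src0_ddq to ggml_cuda_op_mul_mat_vec, which applies its offsets in bytes. A standalone illustration of why byte-offset code needs a char * base:

#include <cassert>

int main() {
    float buf[8] = {};
    void * p = buf;           // arithmetic on p itself would not compile
    char * base = (char *) p; // char * steps in bytes, like ggml's nb[] strides
    float * second_row = (float *) (base + 4*sizeof(float));
    assert(second_row == buf + 4);
}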
@@ -6357,7 +6404,7 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_

     if (all_on_device && !src0_is_quantized && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && !src0_is_quantized && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
+    } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
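With !src0_is_quantized dropped from the second branch, a quantized, non-contiguous src0 paired with a contiguous single-column src1 now reaches ggml_cuda_mul_mat_vec_nc, which dispatches on src0->type as shown above. A toy model of the routing (fake_tensor and route are illustrative names, not llama.cpp API):

#include <cstdio>

struct fake_tensor { bool contiguous, permuted, quantized; long long ne1; };

static const char * route(const fake_tensor & src0, const fake_tensor & src1,
                          bool all_on_device) {
    if (all_on_device && !src0.quantized && src0.permuted && src1.permuted && src1.ne1 == 1) {
        return "mul_mat_vec_p021";
    } else if (all_on_device && !src0.contiguous && src1.contiguous && src1.ne1 == 1) {
        return "mul_mat_vec_nc"; // quantized src0 no longer excluded here
    }
    return "generic mul_mat path";
}

int main() {
    const fake_tensor src0 = {false, false, true, 4096}; // quantized, non-contiguous
    const fake_tensor src1 = {true,  false, false, 1};   // contiguous vector
    printf("%s\n", route(src0, src1, true)); // -> mul_mat_vec_nc
}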