@@ -1414,7 +1414,9 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
+static __global__ void quantize_q8_1(
+    const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded, const int nchannels) {
+
     const int ix = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (ix >= kx_padded) {
@@ -4292,11 +4294,13 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
 }
 
-static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
+static void quantize_row_q8_1_cuda(
+    const float * x, void * vy, const int kx, const int ky, const int kx_padded, const int nchannels, cudaStream_t stream) {
+
     const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    const dim3 num_blocks(block_num_x, ky, 1);
+    const dim3 num_blocks(block_num_x, ky*nchannels, 1);
     const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
-    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
+    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded, nchannels);
 }
 
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -5552,7 +5556,7 @@ inline void ggml_cuda_op_mul_mat_q(
         ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
     size_t as;
     void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*nchannels*sizeof(block_q8_1)/QK8_1, &as);
-    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11*nchannels, padded_row_size, cudaStream_main);
+    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, nchannels, cudaStream_main);
 
     // const int row_stride = nb01 / ggml_type_size(src0->type);
     const int row_stride = src0->backend == GGML_BACKEND_GPU && src1->backend == GGML_BACKEND_GPU &&
@@ -5706,7 +5710,7 @@ inline void ggml_cuda_op_mul_mat_vec(
         ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
     size_t as;
     void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne02*sizeof(block_q8_1)/QK8_1, &as);
-    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne02, padded_row_size, cudaStream_main);
+    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, 1, padded_row_size, ne02, cudaStream_main);
 
     const int row_delta = nb01 / ggml_type_size(src0->type);
     const int channel_delta = nb02 / ggml_type_size(src0->type);
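
The hunks above only touch the host-side signatures and the launch grid; the kernel body that consumes the new nchannels argument is not shown. Below is a minimal, self-contained sketch of how a grid of (block_num_x, ky*nchannels) blocks could be decoded back into a (row, channel) pair inside the kernel. The decomposition order, the flat float/int8 layouts, and all *_sketch names are assumptions made for illustration only; this is not the actual block_q8_1 quantization in ggml-cuda.cu.

// Hypothetical sketch, NOT the real ggml-cuda kernel: shows one way to map
// blockIdx.y in [0, ky*nchannels) back to a (row, channel) pair.
#include <cuda_runtime.h>
#include <cstdio>

#define CUDA_QUANTIZE_BLOCK_SIZE 256

__global__ void quantize_q8_1_sketch(const float * __restrict__ x, signed char * __restrict__ vy,
                                     const int kx, const int kx_padded, const int nchannels) {
    const int ix = blockDim.x*blockIdx.x + threadIdx.x;
    if (ix >= kx_padded) {
        return;
    }

    // Assumed decomposition of the enlarged y grid into (row, channel):
    const int channel = blockIdx.y % nchannels;
    const int iy      = blockIdx.y / nchannels;
    const int ky      = gridDim.y / nchannels;      // rows per channel

    // Assumed contiguous layouts: unpadded source, padded destination;
    // out-of-range columns are zero-padded like the padded row size implies.
    const float v = ix < kx ? x[channel*ky*kx + iy*kx + ix] : 0.0f;

    // Toy "quantization": clamp and round to int8 (the real kernel writes block_q8_1).
    vy[channel*ky*kx_padded + iy*kx_padded + ix] = (signed char) roundf(fmaxf(-127.0f, fminf(127.0f, v)));
}

static void quantize_row_q8_1_sketch_cuda(const float * x, signed char * vy, const int kx, const int ky,
                                          const int kx_padded, const int nchannels, cudaStream_t stream) {
    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
    const dim3 num_blocks(block_num_x, ky*nchannels, 1);   // same grid shape as the patched launcher
    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
    quantize_q8_1_sketch<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded, nchannels);
}

int main() {
    const int kx = 32, kx_padded = 32, ky = 2, nchannels = 3;
    float       *x;
    signed char *y;
    cudaMallocManaged(&x, nchannels*ky*kx*sizeof(float));
    cudaMallocManaged(&y, nchannels*ky*kx_padded);
    for (int i = 0; i < nchannels*ky*kx; ++i) {
        x[i] = (float)(i % 7);
    }

    quantize_row_q8_1_sketch_cuda(x, y, kx, ky, kx_padded, nchannels, 0);
    cudaDeviceSynchronize();
    printf("y[0..3] = %d %d %d %d\n", y[0], y[1], y[2], y[3]);

    cudaFree(x);
    cudaFree(y);
    return 0;
}

This mirrors the patched call sites: ggml_cuda_op_mul_mat_q passes (ne10, ne11, padded_row_size, nchannels), while ggml_cuda_op_mul_mat_vec treats each channel as a single row and passes (ne10, 1, padded_row_size, ne02).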