@@ -1414,7 +1414,9 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
+static __global__ void quantize_q8_1(
+    const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded, const int nchannels) {
+
     const int ix = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (ix >= kx_padded) {
@@ -4292,11 +4294,13 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
 }
 
-static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
+static void quantize_row_q8_1_cuda(
+    const float * x, void * vy, const int kx, const int ky, const int kx_padded, const int nchannels, cudaStream_t stream) {
+
     const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    const dim3 num_blocks(block_num_x, ky, 1);
+    const dim3 num_blocks(block_num_x, ky*nchannels, 1);
     const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
-    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
+    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded, nchannels);
 }
 
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -5552,7 +5556,7 @@ inline void ggml_cuda_op_mul_mat_q(
         ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
     size_t as;
     void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*nchannels*sizeof(block_q8_1)/QK8_1, &as);
-    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11*nchannels, padded_row_size, cudaStream_main);
+    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, nchannels, cudaStream_main);
 
     // const int row_stride = nb01 / ggml_type_size(src0->type);
     const int row_stride = src0->backend == GGML_BACKEND_GPU && src1->backend == GGML_BACKEND_GPU &&
@@ -5706,7 +5710,7 @@ inline void ggml_cuda_op_mul_mat_vec(
         ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
     size_t as;
     void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne02*sizeof(block_q8_1)/QK8_1, &as);
-    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne02, padded_row_size, cudaStream_main);
+    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, 1, padded_row_size, ne02, cudaStream_main);
 
     const int row_delta = nb01 / ggml_type_size(src0->type);
     const int channel_delta = nb02 / ggml_type_size(src0->type);
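
The hunks above only touch the host-side signatures and the launch grid; the kernel body that consumes the new nchannels argument is not shown. Below is a minimal, self-contained sketch of how a grid of (block_num_x, ky*nchannels) blocks could be decoded back into a (row, channel) pair inside the kernel. The decomposition order, the flat float/int8 layouts, and all *_sketch names are assumptions made for illustration only; this is not the actual block_q8_1 quantization in ggml-cuda.cu.

// Hypothetical sketch, NOT the real ggml-cuda kernel: shows one way to map
// blockIdx.y in [0, ky*nchannels) back to a (row, channel) pair.
#include <cuda_runtime.h>
#include <cstdio>

#define CUDA_QUANTIZE_BLOCK_SIZE 256

__global__ void quantize_q8_1_sketch(const float * __restrict__ x, signed char * __restrict__ vy,
                                     const int kx, const int kx_padded, const int nchannels) {
    const int ix = blockDim.x*blockIdx.x + threadIdx.x;
    if (ix >= kx_padded) {
        return;
    }

    // Assumed decomposition of the enlarged y grid into (row, channel):
    const int channel = blockIdx.y % nchannels;
    const int iy      = blockIdx.y / nchannels;
    const int ky      = gridDim.y / nchannels;      // rows per channel

    // Assumed contiguous layouts: unpadded source, padded destination;
    // out-of-range columns are zero-padded like the padded row size implies.
    const float v = ix < kx ? x[channel*ky*kx + iy*kx + ix] : 0.0f;

    // Toy "quantization": clamp and round to int8 (the real kernel writes block_q8_1).
    vy[channel*ky*kx_padded + iy*kx_padded + ix] = (signed char) roundf(fmaxf(-127.0f, fminf(127.0f, v)));
}

static void quantize_row_q8_1_sketch_cuda(const float * x, signed char * vy, const int kx, const int ky,
                                          const int kx_padded, const int nchannels, cudaStream_t stream) {
    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
    const dim3 num_blocks(block_num_x, ky*nchannels, 1);   // same grid shape as the patched launcher
    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
    quantize_q8_1_sketch<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded, nchannels);
}

int main() {
    const int kx = 32, kx_padded = 32, ky = 2, nchannels = 3;
    float       *x;
    signed char *y;
    cudaMallocManaged(&x, nchannels*ky*kx*sizeof(float));
    cudaMallocManaged(&y, nchannels*ky*kx_padded);
    for (int i = 0; i < nchannels*ky*kx; ++i) {
        x[i] = (float)(i % 7);
    }

    quantize_row_q8_1_sketch_cuda(x, y, kx, ky, kx_padded, nchannels, 0);
    cudaDeviceSynchronize();
    printf("y[0..3] = %d %d %d %d\n", y[0], y[1], y[2], y[3]);

    cudaFree(x);
    cudaFree(y);
    return 0;
}

This mirrors the patched call sites: ggml_cuda_op_mul_mat_q passes (ne10, ne11, padded_row_size, nchannels), while ggml_cuda_op_mul_mat_vec treats each channel as a single row and passes (ne10, 1, padded_row_size, ne02).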