@@ -1415,7 +1415,8 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
 }
 
 static __global__ void quantize_q8_1(
-    const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded, const int ky) {
+    const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded, const int ky,
+    const int row_stride, const int channel_stride) {
 
     const int ix = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -1433,7 +1434,7 @@ static __global__ void quantize_q8_1(
     const int ib = i_padded / QK8_1; // block index
     const int iqs = i_padded % QK8_1; // quant index
 
-    const float xi = ix < kx ? x[channel*ky*kx + iy*kx + ix] : 0.0f;
+    const float xi = ix < kx ? x[channel*channel_stride + iy*row_stride + ix] : 0.0f;
 
     float amax = fabsf(xi);
     float sum = xi;
 
@@ -4301,7 +4302,7 @@ static void quantize_row_q8_1_cuda(
     const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
     const dim3 num_blocks(block_num_x, ky, nchannels);
     const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
-    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded, ky);
+    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded, ky, kx, ky*kx);
 }
 
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
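
As the patch shows, quantize_q8_1 previously hard-coded contiguous addressing (channel*ky*kx + iy*kx + ix); the new row_stride and channel_stride parameters replace that, and the launcher in quantize_row_q8_1_cuda passes kx and ky*kx so existing contiguous callers behave exactly as before. The standalone C++ sketch below is not part of the patch; it uses made-up sizes purely to check that the stride-based offset with those default strides matches the old contiguous offset.

// Sketch only: compares the old contiguous addressing with the new
// stride-based addressing, assuming row_stride = kx and channel_stride = ky*kx
// (the values quantize_row_q8_1_cuda passes in this patch).
#include <cassert>
#include <cstdio>

// Element offset under the old hard-coded contiguous layout.
static int offset_contiguous(int channel, int iy, int ix, int kx, int ky) {
    return channel*ky*kx + iy*kx + ix;
}

// Element offset under the new stride-based addressing in quantize_q8_1.
static int offset_strided(int channel, int iy, int ix, int row_stride, int channel_stride) {
    return channel*channel_stride + iy*row_stride + ix;
}

int main() {
    const int kx = 8, ky = 4, nchannels = 3; // hypothetical tensor sizes

    for (int c = 0; c < nchannels; ++c) {
        for (int iy = 0; iy < ky; ++iy) {
            for (int ix = 0; ix < kx; ++ix) {
                // With the default strides the two formulas agree element for element.
                assert(offset_contiguous(c, iy, ix, kx, ky) ==
                       offset_strided(c, iy, ix, /*row_stride=*/kx, /*channel_stride=*/ky*kx));
            }
        }
    }
    printf("strided addressing matches the old contiguous addressing\n");
    return 0;
}

A caller that needs to quantize a non-contiguous source (for example, a row- or channel-strided view) would presumably pass the view's actual strides instead of kx and ky*kx, which is what the extra kernel parameters make possible.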