Commit 7ce23b2

q4_1
1 parent 104138c commit 7ce23b2

File tree

1 file changed: ggml-cuda.cu (+83, -9 lines)
ggml-cuda.cu

Lines changed: 83 additions & 9 deletions
@@ -1189,7 +1189,7 @@ static __global__ void quantize_q8_0(const float * x, void * vy, const int k) {
 }

 static __device__ float vec_dot_q4_0_q8_0(const void * vbq, const block_q8_0 * bq8_0, const int iqs) {
-    const block_q8_0 * bq4_0 = (const block_q8_0 *) vbq;
+    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

     int vi;
     int ui0, ui1;
@@ -1209,6 +1209,31 @@ static __device__ float vec_dot_q4_0_q8_0(const void * vbq, const block_q8_0 * bq8_0, const int iqs) {

 }

+static __device__ float vec_dot_q4_1_q8_0(const void * vbq, const block_q8_0 * bq8_0, const int iqs) {
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+    int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
+    int ui0, ui1;
+    memcpy(&ui0, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    memcpy(&ui1, &bq8_0->qs[sizeof(int) * (iqs + 4)], sizeof(int));
+
+    const float d4_1 = bq4_1->d;
+    const float m = bq4_1->m;
+    const float d8_0 = bq8_0->d;
+
+    const int vi0 = (vi >> 0) & 0x0F0F0F0F;
+    const int vi1 = (vi >> 4) & 0x0F0F0F0F;
+
+    const int sumi0 = __dp4a(vi0, ui0, 0);
+    const int sumi1 = __dp4a(vi1, ui1, 0);
+
+    const int sumi2 = __dp4a(0x01010101, ui0, 0);
+    const int sumi3 = __dp4a(0x01010101, ui1, 0);
+
+    return (sumi0 + sumi1)*d4_1*d8_0 + (sumi2 + sumi3)*m*d8_0;
+
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_block(const void * vx, float * y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
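Note on the arithmetic in the new vec_dot_q4_1_q8_0 above: a Q4_1 block stores a scale d and a minimum m, so each 4-bit quant q dequantizes to d*q + m, while a Q8_0 block stores a scale d8 and signed 8-bit quants u. Expanding the dot product gives (d*q + m) dot (d8*u) = d*d8*sum(q*u) + m*d8*sum(u), which is what the two pairs of __dp4a calls compute (the 0x01010101 operand sums the four int8 lanes of ui0/ui1). A scalar sketch of the same per-call arithmetic follows; the names vec_dot_q4_1_q8_0_ref, q4 and u are illustrative and not part of the commit.

    #include <cstdint>

    // Host-side reference for what one vec_dot_q4_1_q8_0 call computes:
    // 8 unpacked 4-bit quants q4[j] in [0, 15] against 8 int8 quants u[j].
    static float vec_dot_q4_1_q8_0_ref(const uint8_t * q4, const int8_t * u,
                                       float d4_1, float m, float d8_0) {
        int sumqu = 0; // corresponds to sumi0 + sumi1 in the kernel
        int sumu  = 0; // corresponds to sumi2 + sumi3 in the kernel
        for (int j = 0; j < 8; ++j) {
            sumqu += q4[j] * u[j];
            sumu  += u[j];
        }
        // (d4_1*q + m) dot (d8_0*u) = d4_1*d8_0*sum(q*u) + m*d8_0*sum(u)
        return sumqu*d4_1*d8_0 + sumu*m*d8_0;
    }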
@@ -1754,6 +1779,15 @@ static void mul_mat_vec_q4_0_q8_0_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

+static void mul_mat_vec_q4_1_q8_0_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    mul_mat_vec_q<QK4_0, block_q4_1, vec_dot_q4_1_q8_0>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
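The new launcher mirrors mul_mat_vec_q4_0_q8_0_cuda: each thread block covers GGML_CUDA_DMMV_Y output rows, with WARP_SIZE threads per row, and the grid spans the rows in its y dimension. Passing QK4_0 as the qk template argument for a Q4_1 matrix works because QK4_0 and QK4_1 are both 32. A rough sketch of the resulting geometry, using assumed values (WARP_SIZE = 32, GGML_CUDA_DMMV_Y = 1, nrows = 4096) rather than anything taken from the commit:

    #include <cstdio>

    int main() {
        const int nrows     = 4096; // assumed number of matrix rows
        const int dmmv_y    = 1;    // assumed GGML_CUDA_DMMV_Y
        const int warp_size = 32;   // assumed WARP_SIZE
        const int block_num_y = (nrows + dmmv_y - 1) / dmmv_y; // same round-up as the launcher
        // With these values: grid = (1, 4096, 1), block = (32, 1, 1), i.e. one warp per output row.
        printf("grid = (1, %d, 1), block = (%d, %d, 1)\n", block_num_y, warp_size, dmmv_y);
        return 0;
    }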
@@ -2199,15 +2233,9 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
 #endif // GGML_CUDA_DMMV_F16

-    size_t as;
-    void * src1_q8_0;
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            GGML_ASSERT(ne00 % QK8_0 == 0);
-            src1_q8_0 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_0)/QK8_0, &as);
-            quantize_row_q8_0_cuda(src1_ddf_i, src1_q8_0, ne00, cudaStream_main);
-            mul_mat_vec_q4_0_q8_0_cuda(src0_ddq_i, src1_q8_0, dst_ddf_i, ne00, nrows, cudaStream_main);
-            ggml_cuda_pool_free(src1_q8_0, as);
+            dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
             break;
         case GGML_TYPE_Q4_1:
             dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
@@ -2258,6 +2286,46 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     (void) i1;
 }

+inline void ggml_cuda_op_mul_mat_vec_q(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddq_i != nullptr);
+    GGML_ASSERT(src1_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows = i01_high - i01_low;
+
+    GGML_ASSERT(ne00 % QK8_0 == 0);
+
+    size_t as;
+    void * src1_q8_0 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_0)/QK8_0, &as);
+    quantize_row_q8_0_cuda(src1_ddf_i, src1_q8_0, ne00, cudaStream_main);
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            mul_mat_vec_q4_0_q8_0_cuda(src0_ddq_i, src1_q8_0, dst_ddf_i, ne00, nrows, cudaStream_main);
+            break;
+        case GGML_TYPE_Q4_1:
+            mul_mat_vec_q4_1_q8_0_cuda(src0_ddq_i, src1_q8_0, dst_ddf_i, ne00, nrows, cudaStream_main);
+            break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+    CUDA_CHECK(cudaGetLastError());
+
+    ggml_cuda_pool_free(src1_q8_0, as);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_mul_mat_cublas(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
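The new ggml_cuda_op_mul_mat_vec_q above takes over the src1 handling that the earlier hunk removed from ggml_cuda_op_dequantize_mul_mat_vec: the f32 row of src1 (length ne00) is quantized to Q8_0 into a temporary buffer from the CUDA memory pool, the type-specific kernel consumes it, and the buffer is returned to the pool. The allocation is ne00*sizeof(block_q8_0)/QK8_0 bytes, i.e. one block_q8_0 per QK8_0 values. A back-of-the-envelope sketch with illustrative numbers; the 34-byte figure is an assumption about sizeof(block_q8_0) (a 2-byte scale plus 32 int8 quants), not something stated in the commit.

    #include <cstdio>

    int main() {
        const int    ne00        = 4096; // assumed row length; must be a multiple of QK8_0
        const int    qk8_0       = 32;   // values per Q8_0 block
        const size_t block_bytes = 34;   // assumed sizeof(block_q8_0): 2-byte scale + 32 quants
        const size_t buf_bytes   = (size_t) ne00 * block_bytes / qk8_0;
        printf("%d floats -> %d Q8_0 blocks -> %zu bytes of pool memory\n",
               ne00, ne00 / qk8_0, buf_bytes);
        return 0;
    }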
@@ -2798,7 +2866,13 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
         ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
+            bool use_mul_mat_vec_q = false;
+            use_mul_mat_vec_q = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1;
+            if (use_mul_mat_vec_q) {
+                ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, false, false);
+            } else {
+                ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
+            }
         } else {
             ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
         }
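With the dispatch change above, the quantized-src1 path is taken only for matrix-vector products (src1->ne[1] == 1) on Q4_0 and Q4_1 weights; every other quantized type, and F16, still goes through ggml_cuda_op_dequantize_mul_mat_vec, and non-vector shapes fall back to cuBLAS. A condensed sketch of the routing decision; would_use_mul_mat_vec_q is an illustrative helper, not part of the commit.

    // Condensed view of the routing added in this commit (illustrative helper, not in the commit).
    static bool would_use_mul_mat_vec_q(bool src0_is_q4_0_or_q4_1,
                                        long ne00, long ne01, long src1_ne1,
                                        int dmmv_x, int dmmv_y) {
        const bool mat_vec_shape = src1_ne1 == 1 && ne00 % dmmv_x == 0 && ne01 % dmmv_y == 0;
        return mat_vec_shape && src0_is_q4_0_or_q4_1; // otherwise: dequantize path or cuBLAS
    }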
