@@ -1189,7 +1189,7 @@ static __global__ void quantize_q8_0(const float * x, void * vy, const int k) {
 }

 static __device__ float vec_dot_q4_0_q8_0(const void * vbq, const block_q8_0 * bq8_0, const int iqs) {
-    const block_q8_0 * bq4_0 = (const block_q8_0 *) vbq;
+    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

     int vi;
     int ui0, ui1;
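
The one-line change above is a type fix: vbq points at Q4_0 blocks, but the old code cast it to block_q8_0. It happened to behave correctly only because both block types keep the scale d at offset 0 with the quants qs immediately after it, so the loads landed on the right bytes anyway; the cast was still wrong and would break if either layout changed. For reference, a sketch of the two CUDA-side block definitions as they appear earlier in ggml-cuda.cu around this change (field types reproduced from memory and may differ between versions):

    #define QK4_0 32
    typedef struct {
        half    d;              // scale
        uint8_t qs[QK4_0 / 2];  // 32 4-bit quants, packed two per byte
    } block_q4_0;

    #define QK8_0 32
    typedef struct {
        half   d;          // scale
        int8_t qs[QK8_0];  // 32 8-bit quants
    } block_q8_0;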
@@ -1209,6 +1209,31 @@ static __device__ float vec_dot_q4_0_q8_0(const void * vbq, const block_q8_0 * b

 }

+static __device__ float vec_dot_q4_1_q8_0(const void * vbq, const block_q8_0 * bq8_0, const int iqs) {
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+    int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
+    int ui0, ui1;
+    memcpy(&ui0, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    memcpy(&ui1, &bq8_0->qs[sizeof(int) * (iqs + 4)], sizeof(int));
+
+    const float d4_1 = bq4_1->d;
+    const float m    = bq4_1->m;
+    const float d8_0 = bq8_0->d;
+
+    const int vi0 = (vi >> 0) & 0x0F0F0F0F;
+    const int vi1 = (vi >> 4) & 0x0F0F0F0F;
+
+    const int sumi0 = __dp4a(vi0, ui0, 0);
+    const int sumi1 = __dp4a(vi1, ui1, 0);
+
+    const int sumi2 = __dp4a(0x01010101, ui0, 0);
+    const int sumi3 = __dp4a(0x01010101, ui1, 0);
+
+    return (sumi0 + sumi1)*d4_1*d8_0 + (sumi2 + sumi3)*m*d8_0;
+
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_block(const void * vx, float * y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
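
The new vec_dot_q4_1_q8_0 follows the same pattern as the Q4_0 version, with one extra term. Q4_1 is an affine format: an element dequantizes as x_i = d4_1*q_i + m, while Q8_0 dequantizes as y_i = d8_0*u_i. The partial dot product over one chunk is therefore

    sum_i x_i*y_i = d4_1*d8_0 * sum_i q_i*u_i  +  m*d8_0 * sum_i u_i

sumi0/sumi1 accumulate sum q_i*u_i for the low and high nibbles via __dp4a, and sumi2/sumi3 compute sum u_i by dotting against 0x01010101 (every byte equal to 1). Below is a minimal host-side sketch of what one call computes, assuming the usual ggml nibble layout (byte j of a block holds element j in its low nibble and element j + 16 in its high nibble); the function name and flat-pointer signature are illustrative, not ggml API:

    #include <stdint.h>

    // q4_qs: the 16 quant bytes of one q4_1 block; q8_qs: the 32 quants of
    // the matching q8_0 block; iqs selects a 4-byte chunk, as in the kernel.
    float vec_dot_q4_1_q8_0_ref(const uint8_t * q4_qs, float d4_1, float m,
                                const int8_t * q8_qs, float d8_0, int iqs) {
        int sumi = 0; // sum of q_i * u_i
        int sumu = 0; // sum of u_i (multiplies the offset m)
        for (int j = 0; j < 4; ++j) {
            const int b  = 4*iqs + j;        // byte index within the block
            const int lo = q4_qs[b] & 0x0F;  // element b
            const int hi = q4_qs[b] >> 4;    // element b + 16
            sumi += lo*q8_qs[b] + hi*q8_qs[b + 16];
            sumu +=    q8_qs[b] +    q8_qs[b + 16];
        }
        return (sumi*d4_1 + sumu*m)*d8_0; // same value as the kernel's return
    }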
@@ -1754,6 +1779,15 @@ static void mul_mat_vec_q4_0_q8_0_cuda(const void * vx, const void * vy, float *
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

+static void mul_mat_vec_q4_1_q8_0_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    mul_mat_vec_q<QK4_0, block_q4_1, vec_dot_q4_1_q8_0>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
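
The launcher mirrors mul_mat_vec_q4_0_q8_0_cuda directly above it: a (1, ceil(nrows / GGML_CUDA_DMMV_Y)) grid of WARP_SIZE x GGML_CUDA_DMMV_Y thread blocks, i.e. one warp per output row, where (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y is the usual integer-ceiling idiom. Note the first template argument stays QK4_0; since QK4_0 and QK4_1 are both 32 this is harmless, though QK4_1 would be the clearer choice.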
@@ -2199,15 +2233,9 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
 #endif // GGML_CUDA_DMMV_F16

-    size_t as;
-    void * src1_q8_0;
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            GGML_ASSERT(ne00 % QK8_0 == 0);
-            src1_q8_0 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_0)/QK8_0, &as);
-            quantize_row_q8_0_cuda(src1_ddf_i, src1_q8_0, ne00, cudaStream_main);
-            mul_mat_vec_q4_0_q8_0_cuda(src0_ddq_i, src1_q8_0, dst_ddf_i, ne00, nrows, cudaStream_main);
-            ggml_cuda_pool_free(src1_q8_0, as);
+            dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
             break;
         case GGML_TYPE_Q4_1:
             dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
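
With the dedicated op below taking over, the Q4_0 case of ggml_cuda_op_dequantize_mul_mat_vec reverts to the plain dequantizing kernel: the quantize-src1-then-integer-dot logic (and its q8_0 scratch allocation) is removed here and reappears, generalized across types, in the new ggml_cuda_op_mul_mat_vec_q.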
@@ -2258,6 +2286,46 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     (void) i1;
 }

+inline void ggml_cuda_op_mul_mat_vec_q(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddq_i != nullptr);
+    GGML_ASSERT(src1_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows = i01_high - i01_low;
+
+    GGML_ASSERT(ne00 % QK8_0 == 0);
+
+    size_t as;
+    void * src1_q8_0 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_0)/QK8_0, &as);
+    quantize_row_q8_0_cuda(src1_ddf_i, src1_q8_0, ne00, cudaStream_main);
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            mul_mat_vec_q4_0_q8_0_cuda(src0_ddq_i, src1_q8_0, dst_ddf_i, ne00, nrows, cudaStream_main);
+            break;
+        case GGML_TYPE_Q4_1:
+            mul_mat_vec_q4_1_q8_0_cuda(src0_ddq_i, src1_q8_0, dst_ddf_i, ne00, nrows, cudaStream_main);
+            break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+    CUDA_CHECK(cudaGetLastError());
+
+    ggml_cuda_pool_free(src1_q8_0, as);
+
+    (void) src1;
+    (void) dst;
+    (void) src0_ddf_i;
+    (void) i02;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_mul_mat_cublas(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
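
The new op quantizes the f32 src1 column to Q8_0 once and then dispatches on src0->type, so every vec_dot_*_q8_0 kernel shares the same scratch buffer. The pool allocation is sized at ne00*sizeof(block_q8_0)/QK8_0 bytes, one block per QK8_0 = 32 values; for example, with ne00 = 4096 and a 34-byte block (half scale plus 32 quants, assuming the layout sketched earlier) that is 128 blocks, about 4.25 KiB per call, returned to the pool right after the kernels are queued on the stream. The trailing (void) casts keep the unused parameters of the shared ggml_cuda_op callback signature warning-free.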
@@ -2798,7 +2866,13 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
         ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
+            bool use_mul_mat_vec_q = false;
+            use_mul_mat_vec_q = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1;
+            if (use_mul_mat_vec_q) {
+                ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, false, false);
+            } else {
+                ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
+            }
         } else {
             ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
         }
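
Finally, the dispatch in ggml_cuda_mul_mat: the quantized mat-vec path is only taken for genuine matrix-vector products (src1->ne[1] == 1, with the row size and row count divisible by GGML_CUDA_DMMV_X and GGML_CUDA_DMMV_Y) and, for now, only for Q4_0 and Q4_1 weights; everything else keeps the existing dequantize or cuBLAS path. Writing use_mul_mat_vec_q as a separate flag leaves an obvious seam for enabling more quantization types later.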