@@ -6334,7 +6334,6 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
     GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
     const int64_t ne00 = src0->ne[0];
@@ -6347,15 +6346,21 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
     cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device];
+    char * src0_ddq = (char *) src0_extra->data_device[g_main_device];
 
     struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
 
     struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
-    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
+    if (src0->type == GGML_TYPE_F16) {
+        ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
+    } else if (ggml_is_quantized(src0->type)) {
+        ggml_cuda_op_mul_mat_vec(src0, src1, dst, src0_ddq, nullptr, src1_ddf, dst_ddf, 0, 0, ne01, 0, cudaStream_main);
+    } else {
+        GGML_ASSERT(false);
+    }
 }
 
 void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -6402,7 +6407,7 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
         src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
     const bool src0_is_quantized = ggml_is_quantized(src0->type);
 
-    if (all_on_device && !src0_is_quantized && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
     } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
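
For reference, a minimal sketch of the kind of graph node that reaches this path: both operands are 0213-permuted views and src1 ends up with ne[1] == 1, matching the asserts above. The helper name, shapes, and variable names below are illustrative, not taken from this patch, and the tensors are assumed to already reside on the GPU backend so that all_on_device holds.

#include "ggml.h"

// Sketch (hypothetical shapes): a K*Q-style mat-vec whose operands are
// ggml_permute(..., 0, 2, 1, 3) views; src1 has ne[1] == 1, so on CUDA the
// node is routed to ggml_cuda_mul_mat_vec_p021. With this patch, k may also
// hold a quantized type, in which case the new branch dispatches to
// ggml_cuda_op_mul_mat_vec instead of the F16 p021 kernel.
static struct ggml_tensor * build_kq_node(struct ggml_context * ctx,
        int64_t head_dim, int64_t n_head, int64_t n_kv) {
    struct ggml_tensor * k  = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, head_dim, n_head, n_kv);
    struct ggml_tensor * q  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, n_head, 1);
    struct ggml_tensor * kp = ggml_permute(ctx, k, 0, 2, 1, 3); // ggml_is_permuted(kp) -> true
    struct ggml_tensor * qp = ggml_permute(ctx, q, 0, 2, 1, 3); // qp->ne[1] == 1
    return ggml_mul_mat(ctx, kp, qp); // result ne = [n_kv, 1, n_head]
}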