@@ -6607,27 +6607,20 @@ inline void ggml_cuda_op_mul_mat_cublas(
             to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
         }
         const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
-        size_t dst_as = 0;
-        half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
 
-        const half alpha_f16 = 1.0f;
-        const half beta_f16 = 0.0f;
+        const float alpha = 1.0f;
+        const float beta = 0.0f;
 
         CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
         CUBLAS_CHECK(
             cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                     row_diff, src1_ncols, ne10,
-                    &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
-                                src1_ptr, CUDA_R_16F, ne10,
-                    &beta_f16,   dst_f16, CUDA_R_16F, ldc,
-                    CUBLAS_COMPUTE_16F,
+                    &alpha,     src0_ptr, CUDA_R_16F, ne00,
+                                src1_ptr, CUDA_R_16F, ne10,
+                    &beta,      dst_dd_i, CUDA_R_32F, ldc,
+                    CUBLAS_COMPUTE_32F,
                     CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
-
-        ggml_cuda_pool_free(dst_f16, dst_as);
-
         if (src0_as != 0) {
             ggml_cuda_pool_free(src0_as_f16, src0_as);
         }
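
Net effect of this hunk: the GEMM now accumulates in FP32 (CUBLAS_COMPUTE_32F) and writes FP32 directly into dst_dd_i, so the temporary dst_f16 buffer and the trailing to_fp32_cuda conversion pass are dropped. Below is a minimal, standalone sketch of the same cuBLAS call pattern (FP16 inputs, FP32 output, FP32 accumulation); the sizes, buffer names, and error-check macros are illustrative and not taken from ggml.

// Sketch only: C (FP32) = A^T (FP16) * B (FP16) with FP32 accumulation,
// mirroring the cublasGemmEx configuration in the hunk above. Compile as .cu.
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CU_CHECK(x) do { cudaError_t    e = (x); if (e != cudaSuccess)            { fprintf(stderr, "cuda error %d\n",   (int) e); exit(1); } } while (0)
#define CB_CHECK(x) do { cublasStatus_t s = (x); if (s != CUBLAS_STATUS_SUCCESS)  { fprintf(stderr, "cublas error %d\n", (int) s); exit(1); } } while (0)

int main() {
    const int m = 64, n = 32, k = 128;           // illustrative dimensions

    half * d_A; half * d_B; float * d_C;
    CU_CHECK(cudaMalloc(&d_A, (size_t) k*m*sizeof(half)));    // A stored k x m, read transposed (CUBLAS_OP_T)
    CU_CHECK(cudaMalloc(&d_B, (size_t) k*n*sizeof(half)));    // B stored k x n
    CU_CHECK(cudaMalloc(&d_C, (size_t) m*n*sizeof(float)));   // C is FP32 -- no separate fp16->fp32 pass needed
    CU_CHECK(cudaMemset(d_A, 0, (size_t) k*m*sizeof(half)));
    CU_CHECK(cudaMemset(d_B, 0, (size_t) k*n*sizeof(half)));

    cublasHandle_t handle;
    CB_CHECK(cublasCreate(&handle));

    // With CUBLAS_COMPUTE_32F the scaling factors are plain floats,
    // which is why the patch switches alpha/beta from half to float.
    const float alpha = 1.0f;
    const float beta  = 0.0f;

    CB_CHECK(cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                          m, n, k,
                          &alpha, d_A, CUDA_R_16F, k,   // lda = k
                                  d_B, CUDA_R_16F, k,   // ldb = k
                          &beta,  d_C, CUDA_R_32F, m,   // FP32 destination, ldc = m
                          CUBLAS_COMPUTE_32F,
                          CUBLAS_GEMM_DEFAULT_TENSOR_OP));

    CB_CHECK(cublasDestroy(handle));
    CU_CHECK(cudaFree(d_A)); CU_CHECK(cudaFree(d_B)); CU_CHECK(cudaFree(d_C));
    return 0;
}
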
@@ -7436,7 +7429,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
 }
 
 __global__ static void k_compute_batched_ptrs(
-        const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
+        const half * src0_as_f16, const half * src1_as_f16, float * dst_f32,
         const void ** ptrs_src, void ** ptrs_dst,
         int ne12, int ne13,
         int ne23,
@@ -7456,7 +7449,7 @@ __global__ static void k_compute_batched_ptrs(
 
     ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02   + i03*nb03;
     ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)     dst_f16 + i12* nb2/2 + i13* nb3/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)     dst_f32 + i12* nb2   + i13* nb3;
 }
 
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
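
The destination stride change in k_compute_batched_ptrs follows from the same switch: nb2 and nb3 are the byte strides of the FP32 dst tensor. The old code wrote into a temporary half buffer with the same element layout, so those byte strides had to be halved; writing into the FP32 buffer itself, they are used as-is. A small hedged sketch of that offset arithmetic (helper names are made up for illustration):

// Hedged sketch of the per-matrix destination offsets, not ggml code.
// nb2_bytes / nb3_bytes stand for the FP32 dst tensor's byte strides along dims 2 and 3.
#include <cstddef>

static inline void * dst_ptr_f32(float * dst_f32, size_t nb2_bytes, size_t nb3_bytes, int i12, int i13) {
    // New scheme: the FP32 byte strides apply directly to the FP32 destination.
    return (char *) dst_f32 + (size_t) i12*nb2_bytes + (size_t) i13*nb3_bytes;
}

static inline void * dst_ptr_f16_tmp(void * dst_f16_tmp, size_t nb2_bytes, size_t nb3_bytes, int i12, int i13) {
    // Old scheme: a temporary half buffer mirrored dst element-for-element,
    // so the FP32 byte strides were halved (sizeof(half) is half of sizeof(float)).
    return (char *) dst_f16_tmp + (size_t) i12*nb2_bytes/2 + (size_t) i13*nb3_bytes/2;
}
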
@@ -7513,18 +7506,15 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
     to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
 
-    size_t dst_as = 0;
-    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
-
     GGML_ASSERT(ne12 % ne02 == 0);
     GGML_ASSERT(ne13 % ne03 == 0);
 
     // broadcast factors
    const int64_t r2 = ne12/ne02;
    const int64_t r3 = ne13/ne03;
 
-    const half alpha_f16 = 1.0f;
-    const half beta_f16 = 0.0f;
+    const float alpha = 1.0f;
+    const float beta  = 0.0f;
 
 #if 0
     // use cublasGemmEx
@@ -7537,10 +7527,10 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
                 CUBLAS_CHECK(
                 cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                     ne01, ne11, ne10,
-                    &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F, nb01/sizeof(half),
-                                (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
-                    &beta_f16,  (      char *)     dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
-                    CUBLAS_COMPUTE_16F,
+                    &alpha,     (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F, nb01/sizeof(half),
+                                (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
+                    &beta,      (      char *)     dst_ddf + i12* dst->nb[2]   + i13* dst->nb[3]  , CUDA_R_32F, ne01,
+                    CUBLAS_COMPUTE_32F,
                     CUBLAS_GEMM_DEFAULT_TENSOR_OP));
             }
         }
@@ -7552,11 +7542,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUBLAS_CHECK(
         cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half),  src0->nb[2]/sizeof(half),  // strideA
-                            (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
-                &beta_f16,  (      char *)     dst_f16, CUDA_R_16F, ne01,                dst->nb[2]/sizeof(float), // strideC
+                &alpha,     (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half),  src0->nb[2]/sizeof(half),  // strideA
+                            (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
+                &beta,      (      char *)     dst_ddf, CUDA_R_32F, ne01,                dst->nb[2]/sizeof(float), // strideC
                 ne12*ne13,
-                CUBLAS_COMPUTE_16F,
+                CUBLAS_COMPUTE_32F,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
         // use cublasGemmBatchedEx
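
The strided-batched and pointer-array paths get the same treatment: FP16 inputs, FP32 destination, FP32 compute. A standalone sketch of the strided-batched variant follows, assuming contiguous batches; all sizes are made up, and the strides are given in elements, which is why the patch divides ggml's byte strides by sizeof(half) / sizeof(float).

// Sketch only: batched C_i (FP32) = A_i^T (FP16) * B_i (FP16) with FP32 accumulation,
// mirroring the cublasGemmStridedBatchedEx configuration above. Compile as .cu.
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CB_CHECK(x) do { cublasStatus_t s = (x); if (s != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "cublas error %d\n", (int) s); exit(1); } } while (0)

int main() {
    const int m = 64, n = 16, k = 128, batch = 8;   // illustrative

    half * d_A; half * d_B; float * d_C;
    cudaMalloc(&d_A, (size_t) batch*k*m*sizeof(half));
    cudaMalloc(&d_B, (size_t) batch*k*n*sizeof(half));
    cudaMalloc(&d_C, (size_t) batch*m*n*sizeof(float));

    cublasHandle_t handle;
    CB_CHECK(cublasCreate(&handle));

    const float alpha = 1.0f;
    const float beta  = 0.0f;

    CB_CHECK(cublasGemmStridedBatchedEx(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                 m, n, k,
                 &alpha, d_A, CUDA_R_16F, k, (long long) k*m,   // strideA in half elements
                         d_B, CUDA_R_16F, k, (long long) k*n,   // strideB in half elements
                 &beta,  d_C, CUDA_R_32F, m, (long long) m*n,   // strideC in float elements
                 batch,
                 CUBLAS_COMPUTE_32F,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));

    CB_CHECK(cublasDestroy(handle));
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    return 0;
}
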
@@ -7573,7 +7563,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
         dim3 block_dims(ne13, ne12);
         k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
-                src0_as_f16, src1_as_f16, dst_f16,
+                src0_as_f16, src1_as_f16, dst_ddf,
                 ptrs_src, ptrs_dst,
                 ne12, ne13,
                 ne23,
@@ -7586,11 +7576,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUBLAS_CHECK(
         cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
-                            (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
-                &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
+                &alpha,     (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                            (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+                &beta,      (      void **) (ptrs_dst + 0*ne23), CUDA_R_32F, ne01,
                 ne23,
-                CUBLAS_COMPUTE_16F,
+                CUBLAS_COMPUTE_32F,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
         if (ptrs_src_s != 0) {
@@ -7602,11 +7592,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     }
 #endif
 
-    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
-
     ggml_cuda_pool_free(src1_as_f16, src1_as);
-    ggml_cuda_pool_free(dst_f16, dst_as);
 }
 
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {