@@ -368,6 +368,7 @@ struct ggml_backend_opencl_context {
    cl_program program_mul_mv_f16_f32;
    cl_program program_mul_mv_f32_f32;
    cl_program program_mul;
+   cl_program program_mul_mat_f16_f32_tiled;
    cl_program program_div;
    cl_program program_sub;
    cl_program program_norm;
@@ -422,6 +423,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_mul_mat_f16_f32_1row;
    cl_kernel kernel_mul_mat_f16_f32;
    cl_kernel kernel_mul_mat_f16_f32_l4;
+   cl_kernel kernel_mul_mat_f16_f32_tiled;
    cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
    cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
    cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
@@ -1015,6 +1017,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        GGML_LOG_CONT(".");
    }

+   // mul_mat_f16_f32_tiled
+   {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+       const std::string kernel_src {
+           #include "mul_mat_f16_f32.cl.h"
+       };
+#else
+       const std::string kernel_src = read_file("mul_mat_f16_f32.cl");
+#endif
+       backend_ctx->program_mul_mat_f16_f32_tiled =
+           build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+       CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_tiled = clCreateKernel(backend_ctx->program_mul_mat_f16_f32_tiled, "mul_mat_f16_f32", &err), err));
+       GGML_LOG_CONT(".");
+   }
+
    // mul
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
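A note on the `GGML_OPENCL_EMBED_KERNELS` branch above: the `#include` sits inside a braced initializer, so the generated `mul_mat_f16_f32.cl.h` must expand to a C++ string literal holding the kernel source. A minimal sketch of what such a generated header could look like (the real file is produced by the build from `mul_mat_f16_f32.cl`; the contents below are illustrative, not from this patch):

```cpp
// mul_mat_f16_f32.cl.h -- hypothetical generated header (illustrative only).
// The build emits the .cl source as one raw string literal, so that
//     const std::string kernel_src { #include "mul_mat_f16_f32.cl.h" };
// compiles to a std::string initialized with the kernel text.
R"(
__kernel void mul_mat_f16_f32(/* ... kernel source embedded verbatim ... */) {
    // ...
}
)"
```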
@@ -4927,6 +4945,58 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
}

+static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const int M = src0->ne[1];
+    const int N = src1->ne[1];
+    const int K = src0->ne[0];
+
+    cl_kernel kernel = backend_ctx->kernel_mul_mat_f16_f32_tiled;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int),      &M));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int),      &N));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),      &K));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem),   &extra1->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offset1));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd));
+
+    // Tiling parameters. These need to be tuned for optimal performance.
+    // They must match the #defines in the kernel mul_mat_f16_f32.cl.
+    //
+    // OPWM / OPWN: output tile size per work-group. A work-group computes a tile of size OPWM x OPWN.
+    // TPWM / TPWN: threads per work-group. This is the work-group size.
+    // OPTM / OPTN: output elements per thread. Each thread computes OPTM x OPTN elements.
+    //
+    // The following relationships must hold:
+    //     OPWM = TPWM * OPTM
+    //     OPWN = TPWN * OPTN
+    //
+    const int OPWM = 64;
+    const int OPWN = 64;
+    const int TPWM = 16;
+    const int TPWN = 8;
+
+    size_t local_work_size[2]  = { TPWM, TPWN };
+    size_t global_work_size[2] = {
+        (size_t) ((M + OPWM - 1) / OPWM) * TPWM,
+        (size_t) ((N + OPWN - 1) / OPWN) * TPWN,
+    };
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
+}
+
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
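The comment block in the hunk above pins down the decomposition: with OPWM = OPWN = 64, TPWM = 16 and TPWN = 8, each 16x8 work-group owns a 64x64 output tile, and each thread computes OPTM x OPTN = 4x8 elements of dst. Since `mul_mat_f16_f32.cl` itself is not shown in this diff, the following is a minimal, hypothetical OpenCL C sketch of a kernel matching that geometry and the host's argument list; a production kernel would presumably also stage A/B tiles through `__local` memory and vectorize loads:

```c
// Hypothetical sketch only -- the real mul_mat_f16_f32.cl is not part of this
// hunk. Only the argument order/types are pinned down by the clSetKernelArg
// calls above.
#define OPWM 64              // output tile rows per work-group
#define OPWN 64              // output tile cols per work-group
#define TPWM 16              // work-group size, dim 0
#define TPWN 8               // work-group size, dim 1
#define OPTM (OPWM / TPWM)   // 4 output rows per thread
#define OPTN (OPWN / TPWN)   // 8 output cols per thread

__kernel void mul_mat_f16_f32(
        const int M, const int N, const int K,
        __global const char * A_raw, ulong offA,   // src0: M x K rows, f16
        __global const char * B_raw, ulong offB,   // src1: N x K rows, f32
        __global       char * D_raw, ulong offD) { // dst:  N rows of M, f32
    __global const half  * A = (__global const half  *)(A_raw + offA);
    __global const float * B = (__global const float *)(B_raw + offB);
    __global       float * D = (__global       float *)(D_raw + offD);

    // Top-left corner of the OPTM x OPTN output block owned by this thread.
    const int m0 = (int)(get_group_id(0) * OPWM + get_local_id(0) * OPTM);
    const int n0 = (int)(get_group_id(1) * OPWN + get_local_id(1) * OPTN);

    float acc[OPTM][OPTN] = {{0.0f}};

    for (int k = 0; k < K; ++k) {
        for (int i = 0; i < OPTM; ++i) {
            if (m0 + i >= M) break;
            // vload_half converts f16 -> f32 without needing cl_khr_fp16.
            const float a = vload_half((ulong)(m0 + i) * K + k, A);
            for (int j = 0; j < OPTN; ++j) {
                if (n0 + j < N) {
                    acc[i][j] += a * B[(ulong)(n0 + j) * K + k];
                }
            }
        }
    }

    // ggml lays dst out as N rows of M floats (dst->ne[0] == M).
    for (int i = 0; i < OPTM; ++i) {
        for (int j = 0; j < OPTN; ++j) {
            if (m0 + i < M && n0 + j < N) {
                D[(ulong)(n0 + j) * M + (m0 + i)] = acc[i][j];
            }
        }
    }
}
```

Each work-group covers a full 64x64 tile even at the matrix edges, which is why the host rounds the group counts up and the kernel needs the m/n bounds checks.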
@@ -4940,6 +5010,18 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

+    if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
+        src0->ne[1] > 32 &&  // M > 32
+        src1->ne[1] > 32 &&  // N > 32
+        src0->ne[0] > 32 &&  // K > 32
+        src0->ne[2] == 1 && src0->ne[3] == 1 &&
+        src1->ne[2] == 1 && src1->ne[3] == 1 &&
+        ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
+        backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) {
+        ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst);
+        return;
+    }
+
    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
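The gate above routes to the tiled kernel only for plain 2-D, contiguous F16 x F32 matmuls with all of M, N, K above 32; smaller or batched shapes fall through to the existing mul_mv paths, and a NULL kernel handle disables the tiled path entirely. To make the ceil-division launch arithmetic concrete, here is a standalone C mirror of the grid computation; OPWM/OPWN/TPWM/TPWN are copied from the patch, while M and N are made-up example sizes:

```c
#include <stdio.h>

/* Mirrors the host-side grid computation in ggml_cl_mul_mat_f16_f32_tiled. */
int main(void) {
    const int OPWM = 64, OPWN = 64, TPWM = 16, TPWN = 8;
    const int M = 1000, N = 500;   /* hypothetical dst shape */

    size_t global[2] = {
        (size_t)((M + OPWM - 1) / OPWM) * TPWM,   /* 16 groups * 16 = 256 */
        (size_t)((N + OPWN - 1) / OPWN) * TPWN,   /*  8 groups *  8 =  64 */
    };
    size_t local[2] = { (size_t)TPWM, (size_t)TPWN };

    printf("global = %zux%zu, local = %zux%zu, groups = %zux%zu\n",
           global[0], global[1], local[0], local[1],
           global[0] / local[0], global[1] / local[1]);
    return 0;
}
```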