From 418606e8f20da3505710743d6bdb98f2c08cf1d1 Mon Sep 17 00:00:00 2001
From: rmatif
Date: Fri, 11 Jul 2025 10:50:01 +0000
Subject: [PATCH 1/3] add mul_mat_f16_f32_image kernel

---
 ggml/src/ggml-opencl/CMakeLists.txt          |   3 +
 ggml/src/ggml-opencl/ggml-opencl.cpp         | 178 ++++++++++++++++++
 .../kernels/mul_mat_f16_f32_image.cl         |  61 ++++++
 .../ggml-opencl/kernels/pack_a_for_image.cl  |  29 +++
 .../ggml-opencl/kernels/pack_b_for_image.cl  |  28 +++
 5 files changed, 299 insertions(+)
 create mode 100644 ggml/src/ggml-opencl/kernels/mul_mat_f16_f32_image.cl
 create mode 100644 ggml/src/ggml-opencl/kernels/pack_a_for_image.cl
 create mode 100644 ggml/src/ggml-opencl/kernels/pack_b_for_image.cl

diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt
index ec5d8cf59556b..dd95443c9ee82 100644
--- a/ggml/src/ggml-opencl/CMakeLists.txt
+++ b/ggml/src/ggml-opencl/CMakeLists.txt
@@ -105,6 +105,9 @@ set(GGML_OPENCL_KERNELS
     pad
     repeat
     mul_mat_f16_f32
+    mul_mat_f16_f32_image
+    pack_a_for_image
+    pack_b_for_image
 )
 
 foreach (K ${GGML_OPENCL_KERNELS})
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 58830b733a8af..335352fc13ebc 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -331,6 +331,8 @@ struct ggml_backend_opencl_context {
     cl_int alignment;
     size_t max_alloc_size;
+    size_t max_image_width;
+    size_t max_image_height;
     bool fp16_support;
     bool has_vector_subgroup_broadcast;
     ggml_cl_compiler_version adreno_cl_compiler_version;
@@ -369,6 +371,10 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mv_f32_f32;
     cl_program program_mul;
     cl_program program_mul_mat_f16_f32_tiled;
+    cl_program program_mul_mat_f16_f32_image;
+    cl_program program_pack_a_for_image;
+    cl_program program_pack_b_for_image;
+    cl_ulong global_mem_size;
     cl_program program_div;
     cl_program program_sub;
     cl_program program_norm;
@@ -424,6 +430,9 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mat_f16_f32;
     cl_kernel kernel_mul_mat_f16_f32_l4;
     cl_kernel kernel_mul_mat_f16_f32_tiled;
+    cl_kernel kernel_mul_mat_f16_f32_image;
+    cl_kernel kernel_pack_a_for_image;
+    cl_kernel kernel_pack_b_for_image;
     cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
     cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
     cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
@@ -1033,6 +1042,54 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // mul_mat_f16_f32_image
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src{
+#include "mul_mat_f16_f32_image.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mat_f16_f32_image.cl");
+#endif
+        backend_ctx->program_mul_mat_f16_f32_image =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_image = clCreateKernel(backend_ctx->program_mul_mat_f16_f32_image, "mul_mat_f16_f32_image", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // pack_a_for_image
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src{
+#include "pack_a_for_image.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("pack_a_for_image.cl");
+#endif
+        backend_ctx->program_pack_a_for_image =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_pack_a_for_image = clCreateKernel(backend_ctx->program_pack_a_for_image, "pack_a_for_image", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // pack_b_for_image
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src{
+#include "pack_b_for_image.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("pack_b_for_image.cl");
+#endif
+        backend_ctx->program_pack_b_for_image =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_pack_b_for_image = clCreateKernel(backend_ctx->program_pack_b_for_image, "pack_b_for_image", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // mul
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1987,6 +2044,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
     GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024);
 
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &backend_ctx->global_mem_size, NULL));
+
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &backend_ctx->max_image_width, NULL));
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &backend_ctx->max_image_height, NULL));
+
     // Check SVM.
     cl_device_svm_capabilities svm_caps;
     CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &svm_caps, 0));
@@ -4997,6 +5059,93 @@ static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_ten
     backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
 }
 
+static void ggml_cl_mul_mat_f16_f32_image(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_context context = backend_ctx->context;
+    cl_command_queue queue = backend_ctx->queue;
+    cl_int err = 0;
+
+    const int M = src0->ne[1];
+    const int N = src1->ne[1];
+    const int K = src0->ne[0];
+    const int K_4 = (K + 3) / 4;
+    const int N_4 = (N + 3) / 4;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_mem a_image = NULL, b_image = NULL;
+    cl_event pack_events[2];
+    cl_event matmul_event;
+
+    // Create image for A
+    cl_image_format format_A = {CL_RGBA, CL_HALF_FLOAT};
+    cl_image_desc desc_A = {};
+    desc_A.image_type = CL_MEM_OBJECT_IMAGE2D;
+    desc_A.image_width = K_4;
+    desc_A.image_height = M;
+    a_image = clCreateImage(context, CL_MEM_READ_WRITE, &format_A, &desc_A, NULL, &err);
+    CL_CHECK(err);
+
+    // Create image for B
+    cl_image_format format_B = {CL_RGBA, CL_HALF_FLOAT};
+    cl_image_desc desc_B = {};
+    desc_B.image_type = CL_MEM_OBJECT_IMAGE2D;
+    desc_B.image_width = N_4;
+    desc_B.image_height = K;
+    b_image = clCreateImage(context, CL_MEM_READ_WRITE, &format_B, &desc_B, NULL, &err);
+    CL_CHECK(err);
+
+    // Launch packing kernel for A
+    cl_kernel pack_a_kernel = backend_ctx->kernel_pack_a_for_image;
+    CL_CHECK(clSetKernelArg(pack_a_kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(pack_a_kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(pack_a_kernel, 2, sizeof(cl_mem), &a_image));
+    CL_CHECK(clSetKernelArg(pack_a_kernel, 3, sizeof(int), &M));
+    CL_CHECK(clSetKernelArg(pack_a_kernel, 4, sizeof(int), &K));
+    const size_t pack_a_gws[2] = { (size_t)K_4, (size_t)M };
+    CL_CHECK(clEnqueueNDRangeKernel(queue, pack_a_kernel, 2, NULL, pack_a_gws, NULL, 0, NULL, &pack_events[0]));
+
+    // Launch packing kernel for B
+    cl_kernel pack_b_kernel = backend_ctx->kernel_pack_b_for_image;
+    CL_CHECK(clSetKernelArg(pack_b_kernel, 0, sizeof(cl_mem), &extra1->data_device));
+    CL_CHECK(clSetKernelArg(pack_b_kernel, 1, sizeof(cl_ulong), &offset1));
+    CL_CHECK(clSetKernelArg(pack_b_kernel, 2, sizeof(cl_mem), &b_image));
+    CL_CHECK(clSetKernelArg(pack_b_kernel, 3, sizeof(int), &K));
+    CL_CHECK(clSetKernelArg(pack_b_kernel, 4, sizeof(int), &N));
+    const size_t pack_b_gws[2] = { (size_t)N_4, (size_t)K };
+    CL_CHECK(clEnqueueNDRangeKernel(queue, pack_b_kernel, 2, NULL, pack_b_gws, NULL, 0, NULL, &pack_events[1]));
+
+    // Launch matmul kernel
+    cl_kernel matmul_kernel = backend_ctx->kernel_mul_mat_f16_f32_image;
+    CL_CHECK(clSetKernelArg(matmul_kernel, 0, sizeof(cl_mem), &a_image));
+    CL_CHECK(clSetKernelArg(matmul_kernel, 1, sizeof(cl_mem), &b_image));
+    CL_CHECK(clSetKernelArg(matmul_kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(matmul_kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(matmul_kernel, 4, sizeof(int), &M));
+    CL_CHECK(clSetKernelArg(matmul_kernel, 5, sizeof(int), &N));
+    CL_CHECK(clSetKernelArg(matmul_kernel, 6, sizeof(int), &K));
+
+    const int OPWM = 64;
+    const int OPWN = 64;
+    const size_t lws[2] = { 16, 8 }; // WG_M, WG_N
+    const size_t gws[2] = { (size_t)ceil((float)M / OPWM) * lws[0], (size_t)ceil((float)N / OPWN) * lws[1] };
+    CL_CHECK(clEnqueueNDRangeKernel(queue, matmul_kernel, 2, NULL, gws, lws, 2, pack_events, &matmul_event));
+
+    // Wait for matmul to finish and release resources
+    CL_CHECK(clWaitForEvents(1, &matmul_event));
+    CL_CHECK(clReleaseEvent(pack_events[0]));
+    CL_CHECK(clReleaseEvent(pack_events[1]));
+    CL_CHECK(clReleaseEvent(matmul_event));
+    CL_CHECK(clReleaseMemObject(a_image));
+    CL_CHECK(clReleaseMemObject(b_image));
+}
+
 static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
@@ -5010,6 +5159,35 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
+    if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
+        backend_ctx->gpu_family == ADRENO && backend_ctx->kernel_mul_mat_f16_f32_image != NULL &&
+        ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
+        src0->ne[2] == 1 && src0->ne[3] == 1 &&
+        src1->ne[2] == 1 && src1->ne[3] == 1) {
+
+        const int M = src0->ne[1];
+        const int N = src1->ne[1];
+        const int K = src0->ne[0];
+
+        // Performance thresholds: only use for reasonably large matrices
+        // where the GPU speedup can outweigh the extra packing passes and image-setup overhead.
+        if (M > 32 && N > 32 && K > 32) {
+            const size_t n_padded_4 = (size_t)((N + 3) / 4);
+            const size_t temp_a_size = (size_t)M * K * sizeof(ggml_fp16_t);
+            const size_t temp_b_size = n_padded_4 * K * 4 * sizeof(ggml_fp16_t); // RGBA
+            const size_t total_temp_image_size = temp_a_size + temp_b_size;
+
+            // Safety checks for memory and device limits
+            if ((size_t)K <= backend_ctx->max_image_width &&
+                (size_t)M <= backend_ctx->max_image_height &&
+                n_padded_4 <= backend_ctx->max_image_height &&
+                total_temp_image_size < (backend_ctx->global_mem_size / 4)) { // Ensure temp images use < 25% of total VRAM
+                ggml_cl_mul_mat_f16_f32_image(backend, src0, src1, dst);
+                return;
+            }
+        }
+    }
+
     if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
         src0->ne[1] > 32 && // M > 32
         src1->ne[1] > 32 && // N > 32
diff --git a/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32_image.cl b/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32_image.cl
new file mode 100644
index 0000000000000..51c93090482df
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32_image.cl
@@ -0,0 +1,61 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+__kernel void mul_mat_f16_f32_image(
+    __read_only image2d_t A_img,
+    __read_only image2d_t B_img,
+    __global float* C_buf,
+    const ulong c_offset,
+    const int M,
+    const int N,
+    const int K
+) {
+    const int n_4_idx = get_global_id(0);
+    const int m_idx = get_global_id(1);
+
+    const int n_base = n_4_idx << 2;
+
+    if (n_base >= N || m_idx >= M) {
+        return;
+    }
+
+    float4 c_vals = (float4)(0.0f);
+    const int K_4 = (K + 3) / 4;
+
+    for (int k_4_idx = 0; k_4_idx < K_4; ++k_4_idx) {
+        int k_base = k_4_idx << 2;
+
+        float4 a_vals = convert_float4(read_imageh(A_img, SAMPLER, (int2)(k_4_idx, m_idx)));
+
+        if (k_base < K) {
+            float4 b0 = convert_float4(read_imageh(B_img, SAMPLER, (int2)(n_4_idx, k_base + 0)));
+            c_vals = mad(a_vals.x, b0, c_vals);
+        }
+        if (k_base + 1 < K) {
+            float4 b1 = convert_float4(read_imageh(B_img, SAMPLER, (int2)(n_4_idx, k_base + 1)));
+            c_vals = mad(a_vals.y, b1, c_vals);
+        }
+        if (k_base + 2 < K) {
+            float4 b2 = convert_float4(read_imageh(B_img, SAMPLER, (int2)(n_4_idx, k_base + 2)));
+            c_vals = mad(a_vals.z, b2, c_vals);
+        }
+        if (k_base + 3 < K) {
+            float4 b3 = convert_float4(read_imageh(B_img, SAMPLER, (int2)(n_4_idx, k_base + 3)));
+            c_vals = mad(a_vals.w, b3, c_vals);
+        }
+    }
+
+    __global float* C = (__global float*)((__global char*)C_buf + c_offset);
+    
+    if (n_base + 3 < N) {
+        C[(n_base + 0) * M + m_idx] = c_vals.x;
+        C[(n_base + 1) * M + m_idx] = c_vals.y;
+        C[(n_base + 2) * M + m_idx] = c_vals.z;
+        C[(n_base + 3) * M + m_idx] = c_vals.w;
+    } else {
+        if (n_base < N) C[n_base * M + m_idx] = c_vals.x;
+        if (n_base + 1 < N) C[(n_base + 1) * M + m_idx] = c_vals.y;
+        if (n_base + 2 < N) C[(n_base + 2) * M + m_idx] = c_vals.z;
+    }
+}
diff --git a/ggml/src/ggml-opencl/kernels/pack_a_for_image.cl b/ggml/src/ggml-opencl/kernels/pack_a_for_image.cl
new file mode 100644
index 0000000000000..acad3db9f7508
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/pack_a_for_image.cl
@@ -0,0 +1,29 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+__kernel void pack_a_for_image(
+    __global const half* src_a,
+    const ulong a_offset,
+    __write_only image2d_t dest_img,
+    const int M,
+    const int K
+) {
+    const int k_4_idx = get_global_id(0);
+    const int m_idx = get_global_id(1);
+
+    const int k_base = k_4_idx << 2;
+
+    if (k_base >= K || m_idx >= M) {
+        return;
+    }
+
+    __global const half* a_ptr = (__global const half*)((__global const char*)src_a + a_offset);
+    const int a_idx_base = m_idx * K + k_base;
+
+    half4 vals;
+    vals.x = a_ptr[a_idx_base];
+    vals.y = (k_base + 1 < K) ? a_ptr[a_idx_base + 1] : (half)0.0h;
+    vals.z = (k_base + 2 < K) ? a_ptr[a_idx_base + 2] : (half)0.0h;
+    vals.w = (k_base + 3 < K) ? a_ptr[a_idx_base + 3] : (half)0.0h;
+
+    write_imageh(dest_img, (int2)(k_4_idx, m_idx), vals);
+}
diff --git a/ggml/src/ggml-opencl/kernels/pack_b_for_image.cl b/ggml/src/ggml-opencl/kernels/pack_b_for_image.cl
new file mode 100644
index 0000000000000..7e9636b155522
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/pack_b_for_image.cl
@@ -0,0 +1,28 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+__kernel void pack_b_for_image(
+    __global const float* src_b,
+    const ulong b_offset,
+    __write_only image2d_t dest_img,
+    const int K,
+    const int N
+) {
+    const int n_4_idx = get_global_id(0);
+    const int k_idx = get_global_id(1);
+
+    const int n_base = n_4_idx << 2;
+
+    if (n_base >= N || k_idx >= K) {
+        return;
+    }
+
+    __global const float* b_ptr = (__global const float*)((__global const char*)src_b + b_offset);
+
+    half4 vals;
+    vals.x = convert_half(b_ptr[n_base * K + k_idx]);
+    vals.y = (n_base + 1 < N) ? convert_half(b_ptr[(n_base + 1) * K + k_idx]) : (half)0.0h;
+    vals.z = (n_base + 2 < N) ? convert_half(b_ptr[(n_base + 2) * K + k_idx]) : (half)0.0h;
+    vals.w = (n_base + 3 < N) ? convert_half(b_ptr[(n_base + 3) * K + k_idx]) : (half)0.0h;
+
+    write_imageh(dest_img, (int2)(n_4_idx, k_idx), vals);
+}

From 18310bf2026000a7e0594baee9f0412b7962e06c Mon Sep 17 00:00:00 2001
From: rmatif
Date: Fri, 11 Jul 2025 13:13:54 +0000
Subject: [PATCH 2/3] fix trailing whitespace

---
 ggml/src/ggml-opencl/kernels/mul_mat_f16_f32_image.cl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32_image.cl b/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32_image.cl
index 51c93090482df..0df41bcd2e710 100644
--- a/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32_image.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32_image.cl
@@ -47,7 +47,7 @@ __kernel void mul_mat_f16_f32_image(
     }
 
     __global float* C = (__global float*)((__global char*)C_buf + c_offset);
-    
+
     if (n_base + 3 < N) {
         C[(n_base + 0) * M + m_idx] = c_vals.x;
         C[(n_base + 1) * M + m_idx] = c_vals.y;

From f68669d50fe762763f64686233c2d7267901212c Mon Sep 17 00:00:00 2001
From: rmatif
Date: Tue, 15 Jul 2025 11:28:26 +0000
Subject: [PATCH 3/3] fix and opt kernel launch

---
 ggml/src/ggml-opencl/ggml-opencl.cpp | 42 +++++++++++++---------------
 1 file changed, 19 insertions(+), 23 deletions(-)

diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 335352fc13ebc..f29363d9593a7 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -5062,7 +5062,6 @@ static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_ten
 static void ggml_cl_mul_mat_f16_f32_image(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
     cl_context context = backend_ctx->context;
-    cl_command_queue queue = backend_ctx->queue;
     cl_int err = 0;
 
     const int M = src0->ne[1];
@@ -5079,26 +5078,23 @@ static void ggml_cl_mul_mat_f16_f32_image(ggml_backend_t backend, const ggml_ten
     cl_ulong offset1 = extra1->offset + src1->view_offs;
     cl_ulong offsetd = extrad->offset + dst->view_offs;
 
-    cl_mem a_image = NULL, b_image = NULL;
-    cl_event pack_events[2];
-    cl_event matmul_event;
+    cl_image_format format = {CL_RGBA, CL_HALF_FLOAT};
+    cl_mem a_image, b_image;
 
     // Create image for A
-    cl_image_format format_A = {CL_RGBA, CL_HALF_FLOAT};
     cl_image_desc desc_A = {};
     desc_A.image_type = CL_MEM_OBJECT_IMAGE2D;
     desc_A.image_width = K_4;
     desc_A.image_height = M;
-    a_image = clCreateImage(context, CL_MEM_READ_WRITE, &format_A, &desc_A, NULL, &err);
+    a_image = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc_A, NULL, &err);
     CL_CHECK(err);
 
     // Create image for B
-    cl_image_format format_B = {CL_RGBA, CL_HALF_FLOAT};
     cl_image_desc desc_B = {};
     desc_B.image_type = CL_MEM_OBJECT_IMAGE2D;
     desc_B.image_width = N_4;
     desc_B.image_height = K;
-    b_image = clCreateImage(context, CL_MEM_READ_WRITE, &format_B, &desc_B, NULL, &err);
+    b_image = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc_B, NULL, &err);
     CL_CHECK(err);
 
     // Launch packing kernel for A
@@ -5108,8 +5104,8 @@ static void ggml_cl_mul_mat_f16_f32_image(ggml_backend_t backend, const ggml_ten
     CL_CHECK(clSetKernelArg(pack_a_kernel, 2, sizeof(cl_mem), &a_image));
     CL_CHECK(clSetKernelArg(pack_a_kernel, 3, sizeof(int), &M));
     CL_CHECK(clSetKernelArg(pack_a_kernel, 4, sizeof(int), &K));
-    const size_t pack_a_gws[2] = { (size_t)K_4, (size_t)M };
-    CL_CHECK(clEnqueueNDRangeKernel(queue, pack_a_kernel, 2, NULL, pack_a_gws, NULL, 0, NULL, &pack_events[0]));
+    size_t pack_a_gws[2] = { (size_t)K_4, (size_t)M };
+    backend_ctx->enqueue_ndrange_kernel(pack_a_kernel, 2, pack_a_gws, NULL, src0);
 
     // Launch packing kernel for B
     cl_kernel pack_b_kernel = backend_ctx->kernel_pack_b_for_image;
@@ -5118,8 +5114,8 @@ static void ggml_cl_mul_mat_f16_f32_image(ggml_backend_t backend, const ggml_ten
     CL_CHECK(clSetKernelArg(pack_b_kernel, 2, sizeof(cl_mem), &b_image));
     CL_CHECK(clSetKernelArg(pack_b_kernel, 3, sizeof(int), &K));
     CL_CHECK(clSetKernelArg(pack_b_kernel, 4, sizeof(int), &N));
-    const size_t pack_b_gws[2] = { (size_t)N_4, (size_t)K };
-    CL_CHECK(clEnqueueNDRangeKernel(queue, pack_b_kernel, 2, NULL, pack_b_gws, NULL, 0, NULL, &pack_events[1]));
+    size_t pack_b_gws[2] = { (size_t)N_4, (size_t)K };
+    backend_ctx->enqueue_ndrange_kernel(pack_b_kernel, 2, pack_b_gws, NULL, src1);
 
     // Launch matmul kernel
     cl_kernel matmul_kernel = backend_ctx->kernel_mul_mat_f16_f32_image;
@@ -5131,17 +5127,17 @@ static void ggml_cl_mul_mat_f16_f32_image(ggml_backend_t backend, const ggml_ten
     CL_CHECK(clSetKernelArg(matmul_kernel, 5, sizeof(int), &N));
     CL_CHECK(clSetKernelArg(matmul_kernel, 6, sizeof(int), &K));
 
-    const int OPWM = 64;
-    const int OPWN = 64;
-    const size_t lws[2] = { 16, 8 }; // WG_M, WG_N
-    const size_t gws[2] = { (size_t)ceil((float)M / OPWM) * lws[0], (size_t)ceil((float)N / OPWN) * lws[1] };
-    CL_CHECK(clEnqueueNDRangeKernel(queue, matmul_kernel, 2, NULL, gws, lws, 2, pack_events, &matmul_event));
-
-    // Wait for matmul to finish and release resources
-    CL_CHECK(clWaitForEvents(1, &matmul_event));
-    CL_CHECK(clReleaseEvent(pack_events[0]));
-    CL_CHECK(clReleaseEvent(pack_events[1]));
-    CL_CHECK(clReleaseEvent(matmul_event));
+    size_t lws[2] = { 16, 8 };
+    const size_t req_gws_x = (size_t)N_4;
+    const size_t req_gws_y = (size_t)M;
+    size_t gws[2] = {
+        (req_gws_x + lws[0] - 1) / lws[0] * lws[0],
+        (req_gws_y + lws[1] - 1) / lws[1] * lws[1],
+    };
+    backend_ctx->enqueue_ndrange_kernel(matmul_kernel, 2, gws, lws, dst);
+
+    // Release resources. The OpenCL runtime will ensure kernels are finished
+    // before releasing the memory objects.
     CL_CHECK(clReleaseMemObject(a_image));
     CL_CHECK(clReleaseMemObject(b_image));
 }
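
Note on the dispatch heuristic (appendix, not part of the patches above): the image-based path is only taken when M, N and K all exceed 32 and the temporary A/B images fit both the device's image2d limits and a quarter of its global memory. The standalone C++ sketch below mirrors those checks so a concrete shape can be evaluated by hand; the struct device_limits, the function would_use_image_path and the example limit values are illustrative assumptions, not code from the patch.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Device limits as queried in ggml_cl2_init via clGetDeviceInfo.
struct device_limits {
    size_t   max_image_width;   // CL_DEVICE_IMAGE2D_MAX_WIDTH
    size_t   max_image_height;  // CL_DEVICE_IMAGE2D_MAX_HEIGHT
    uint64_t global_mem_size;   // CL_DEVICE_GLOBAL_MEM_SIZE
};

// True when an f16 x f32 matmul of an M x K matrix A with a K x N matrix B
// would be routed to ggml_cl_mul_mat_f16_f32_image, using the same thresholds
// as the patch (fp16 elements are 2 bytes).
static bool would_use_image_path(int M, int N, int K, const device_limits & lim) {
    if (M <= 32 || N <= 32 || K <= 32) {
        return false; // small matrices: the packing passes cost more than they save
    }
    const size_t n_padded_4  = (size_t)((N + 3) / 4);
    const size_t temp_a_size = (size_t)M * K * 2;      // A packed as fp16
    const size_t temp_b_size = n_padded_4 * K * 4 * 2; // B packed as RGBA fp16 texels
    const size_t total       = temp_a_size + temp_b_size;

    return (size_t)K  <= lim.max_image_width  &&
           (size_t)M  <= lim.max_image_height &&
           n_padded_4 <= lim.max_image_height &&
           total      <  lim.global_mem_size / 4; // temp images must stay under 25% of device memory
}

int main() {
    // Example limits in the range commonly reported by mobile GPUs (assumed values).
    const device_limits lim = { 16384, 16384, 8ull * 1024 * 1024 * 1024 };
    std::printf("4096 x 4096 x 4096 -> %d\n", would_use_image_path(4096, 4096, 4096, lim));
    std::printf("  16 x   16 x   16 -> %d\n", would_use_image_path(16, 16, 16, lim));
    return 0;
}

Because K_4 = (K + 3) / 4 is the actual width of the A image, comparing K itself against CL_DEVICE_IMAGE2D_MAX_WIDTH is stricter than strictly required, so the heuristic can only err on the side of falling back to the existing buffer-based kernels.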