Skip to content

OpenCL: add mul_mat_f16_f32_image kernel #14635

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions ggml/src/ggml-opencl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ set(GGML_OPENCL_KERNELS
pad
repeat
mul_mat_f16_f32
mul_mat_f16_f32_image
pack_a_for_image
pack_b_for_image
)

foreach (K ${GGML_OPENCL_KERNELS})
Expand Down
174 changes: 174 additions & 0 deletions ggml/src/ggml-opencl/ggml-opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,8 @@ struct ggml_backend_opencl_context {

cl_int alignment;
size_t max_alloc_size;
size_t max_image_width;
size_t max_image_height;
bool fp16_support;
bool has_vector_subgroup_broadcast;
ggml_cl_compiler_version adreno_cl_compiler_version;
Expand Down Expand Up @@ -369,6 +371,10 @@ struct ggml_backend_opencl_context {
cl_program program_mul_mv_f32_f32;
cl_program program_mul;
cl_program program_mul_mat_f16_f32_tiled;
cl_program program_mul_mat_f16_f32_image;
cl_program program_pack_a_for_image;
cl_program program_pack_b_for_image;
cl_ulong global_mem_size;
cl_program program_div;
cl_program program_sub;
cl_program program_norm;
Expand Down Expand Up @@ -424,6 +430,9 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_mul_mat_f16_f32;
cl_kernel kernel_mul_mat_f16_f32_l4;
cl_kernel kernel_mul_mat_f16_f32_tiled;
cl_kernel kernel_mul_mat_f16_f32_image;
cl_kernel kernel_pack_a_for_image;
cl_kernel kernel_pack_b_for_image;
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
Expand Down Expand Up @@ -1033,6 +1042,54 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}

// mul_mat_f16_f32_image
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src{
#include "mul_mat_f16_f32_image.cl.h"
};
#else
const std::string kernel_src = read_file("mul_mat_f16_f32_image.cl");
#endif
backend_ctx->program_mul_mat_f16_f32_image =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_image = clCreateKernel(backend_ctx->program_mul_mat_f16_f32_image, "mul_mat_f16_f32_image", &err), err));
GGML_LOG_CONT(".");
}

// pack_a_for_image
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src{
#include "pack_a_for_image.cl.h"
};
#else
const std::string kernel_src = read_file("pack_a_for_image.cl");
#endif
backend_ctx->program_pack_a_for_image =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

CL_CHECK((backend_ctx->kernel_pack_a_for_image = clCreateKernel(backend_ctx->program_pack_a_for_image, "pack_a_for_image", &err), err));
GGML_LOG_CONT(".");
}

// pack_b_for_image
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src{
#include "pack_b_for_image.cl.h"
};
#else
const std::string kernel_src = read_file("pack_b_for_image.cl");
#endif
backend_ctx->program_pack_b_for_image =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

CL_CHECK((backend_ctx->kernel_pack_b_for_image = clCreateKernel(backend_ctx->program_pack_b_for_image, "pack_b_for_image", &err), err));
GGML_LOG_CONT(".");
}

// mul
{
#ifdef GGML_OPENCL_EMBED_KERNELS
Expand Down Expand Up @@ -1987,6 +2044,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024);

CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &backend_ctx->global_mem_size, NULL));

CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &backend_ctx->max_image_width, NULL));
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &backend_ctx->max_image_height, NULL));

// Check SVM.
cl_device_svm_capabilities svm_caps;
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &svm_caps, 0));
Expand Down Expand Up @@ -4997,6 +5059,89 @@ static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_ten
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
}

// Multiply A (src0, f16, M x K) by B (src1, f32, K x N) into dst (f32) by
// first staging both operands into RGBA/half 2D images and then running the
// image-sampling matmul kernel. Two temporary images are created per call
// and released once the kernels are enqueued.
static void ggml_cl_mul_mat_f16_f32_image(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context;
    cl_context context = ctx->context;
    cl_int     err     = 0;

    // Problem sizes: A is M x K, B is K x N, C is M x N.
    const int M = src0->ne[1];
    const int N = src1->ne[1];
    const int K = src0->ne[0];

    // Widths of the packed images, in 4-component (RGBA) texels.
    const int K_4 = (K + 3) / 4;
    const int N_4 = (N + 3) / 4;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *) src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *) src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *) dst->extra;

    const cl_ulong offset0 = extra0->offset + src0->view_offs;
    const cl_ulong offset1 = extra1->offset + src1->view_offs;
    const cl_ulong offsetd = extrad->offset + dst->view_offs;

    // Both staging images store half-precision RGBA texels.
    const cl_image_format img_format = { CL_RGBA, CL_HALF_FLOAT };

    // Image for A: one image row per matrix row, K values packed 4-wide in x.
    cl_image_desc desc_a = {};
    desc_a.image_type   = CL_MEM_OBJECT_IMAGE2D;
    desc_a.image_width  = K_4;
    desc_a.image_height = M;
    cl_mem img_a = clCreateImage(context, CL_MEM_READ_WRITE, &img_format, &desc_a, NULL, &err);
    CL_CHECK(err);

    // Image for B: one image row per k index, N values packed 4-wide in x.
    cl_image_desc desc_b = {};
    desc_b.image_type   = CL_MEM_OBJECT_IMAGE2D;
    desc_b.image_width  = N_4;
    desc_b.image_height = K;
    cl_mem img_b = clCreateImage(context, CL_MEM_READ_WRITE, &img_format, &desc_b, NULL, &err);
    CL_CHECK(err);

    // Stage A: f16 buffer -> RGBA/half image, one work-item per texel.
    {
        cl_kernel pack_a = ctx->kernel_pack_a_for_image;
        CL_CHECK(clSetKernelArg(pack_a, 0, sizeof(cl_mem),   &extra0->data_device));
        CL_CHECK(clSetKernelArg(pack_a, 1, sizeof(cl_ulong), &offset0));
        CL_CHECK(clSetKernelArg(pack_a, 2, sizeof(cl_mem),   &img_a));
        CL_CHECK(clSetKernelArg(pack_a, 3, sizeof(int),      &M));
        CL_CHECK(clSetKernelArg(pack_a, 4, sizeof(int),      &K));
        size_t gws_a[2] = { (size_t) K_4, (size_t) M };
        ctx->enqueue_ndrange_kernel(pack_a, 2, gws_a, NULL, src0);
    }

    // Stage B: f32 buffer -> RGBA/half image; each texel gathers 4 adjacent
    // columns of B for one k (see pack_b_for_image.cl for the layout).
    {
        cl_kernel pack_b = ctx->kernel_pack_b_for_image;
        CL_CHECK(clSetKernelArg(pack_b, 0, sizeof(cl_mem),   &extra1->data_device));
        CL_CHECK(clSetKernelArg(pack_b, 1, sizeof(cl_ulong), &offset1));
        CL_CHECK(clSetKernelArg(pack_b, 2, sizeof(cl_mem),   &img_b));
        CL_CHECK(clSetKernelArg(pack_b, 3, sizeof(int),      &K));
        CL_CHECK(clSetKernelArg(pack_b, 4, sizeof(int),      &N));
        size_t gws_b[2] = { (size_t) N_4, (size_t) K };
        ctx->enqueue_ndrange_kernel(pack_b, 2, gws_b, NULL, src1);
    }

    // Matmul: one work-item computes 4 adjacent output columns of one row.
    cl_kernel matmul = ctx->kernel_mul_mat_f16_f32_image;
    CL_CHECK(clSetKernelArg(matmul, 0, sizeof(cl_mem),   &img_a));
    CL_CHECK(clSetKernelArg(matmul, 1, sizeof(cl_mem),   &img_b));
    CL_CHECK(clSetKernelArg(matmul, 2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(matmul, 3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(matmul, 4, sizeof(int),      &M));
    CL_CHECK(clSetKernelArg(matmul, 5, sizeof(int),      &N));
    CL_CHECK(clSetKernelArg(matmul, 6, sizeof(int),      &K));

    size_t lws[2] = { 16, 8 };
    // Round the global size up to a multiple of the work-group size; the
    // kernel bounds-checks and ignores the excess work-items.
    size_t gws[2] = {
        ((size_t) N_4 + lws[0] - 1) / lws[0] * lws[0],
        ((size_t) M   + lws[1] - 1) / lws[1] * lws[1],
    };
    ctx->enqueue_ndrange_kernel(matmul, 2, gws, lws, dst);

    // Drop our references. Per the OpenCL spec the images are only destroyed
    // once all enqueued commands that use them have completed.
    CL_CHECK(clReleaseMemObject(img_a));
    CL_CHECK(clReleaseMemObject(img_b));
}

static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
Expand All @@ -5010,6 +5155,35 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co

ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
backend_ctx->gpu_family == ADRENO && backend_ctx->kernel_mul_mat_f16_f32_image != NULL &&
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
src0->ne[2] == 1 && src0->ne[3] == 1 &&
src1->ne[2] == 1 && src1->ne[3] == 1) {

const int M = src0->ne[1];
const int N = src1->ne[1];
const int K = src0->ne[0];

// Performance thresholds: only use for reasonably large matrices
// where the GPU speedup can outweigh the CPU-side transpose/packing overhead.
if (M > 32 && N > 32 && K > 32) {
const size_t n_padded_4 = (size_t)((N + 3) / 4);
const size_t temp_a_size = (size_t)M * K * sizeof(ggml_fp16_t);
const size_t temp_b_size = n_padded_4 * K * 4 * sizeof(ggml_fp16_t); // RGBA
const size_t total_temp_image_size = temp_a_size + temp_b_size;

// Safety checks for memory and device limits
if ((size_t)K <= backend_ctx->max_image_width &&
(size_t)M <= backend_ctx->max_image_height &&
n_padded_4 <= backend_ctx->max_image_height &&
total_temp_image_size < (backend_ctx->global_mem_size / 4)) { // Ensure temp images use < 25% of total VRAM
ggml_cl_mul_mat_f16_f32_image(backend, src0, src1, dst);
return;
}
}
}

if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
src0->ne[1] > 32 && // M > 32
src1->ne[1] > 32 && // N > 32
Expand Down
61 changes: 61 additions & 0 deletions ggml/src/ggml-opencl/kernels/mul_mat_f16_f32_image.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

// C = A * B using image-packed half-precision operands, accumulating in f32.
//
// A_img: M rows x ceil(K/4) texels; texel (k4, m) holds A[m][4*k4 .. 4*k4+3]
//        (zero-padded past K by pack_a_for_image).
// B_img: K rows x ceil(N/4) texels; texel (n4, k) holds B[k][4*n4 .. 4*n4+3].
// C_buf: f32 output at byte offset c_offset; element (m, n) is stored at
//        C[n * M + m], i.e. m varies fastest.
//
// Each work-item computes 4 adjacent output columns (n_base .. n_base+3) of
// one output row m_idx.
__kernel void mul_mat_f16_f32_image(
    __read_only image2d_t A_img,
    __read_only image2d_t B_img,
    __global float* C_buf,
    const ulong c_offset,
    const int M,
    const int N,
    const int K
) {
    const int n_4_idx = get_global_id(0);
    const int m_idx = get_global_id(1);

    const int n_base = n_4_idx << 2;

    // Excess work-items from the rounded-up global size do nothing.
    if (n_base >= N || m_idx >= M) {
        return;
    }

    float4 c_vals = (float4)(0.0f);
    const int K_4 = (K + 3) / 4;

    // Walk the K dimension 4 values at a time: one A texel provides 4 scalar
    // multipliers; each multiplier scales one row-texel of B.
    for (int k_4_idx = 0; k_4_idx < K_4; ++k_4_idx) {
        int k_base = k_4_idx << 2;

        float4 a_vals = convert_float4(read_imageh(A_img, SAMPLER, (int2)(k_4_idx, m_idx)));

        // Guard each B row read so a K that is not a multiple of 4 never
        // samples past row K-1.
        if (k_base < K) {
            float4 b0 = convert_float4(read_imageh(B_img, SAMPLER, (int2)(n_4_idx, k_base + 0)));
            c_vals = mad(a_vals.x, b0, c_vals);
        }
        if (k_base + 1 < K) {
            float4 b1 = convert_float4(read_imageh(B_img, SAMPLER, (int2)(n_4_idx, k_base + 1)));
            c_vals = mad(a_vals.y, b1, c_vals);
        }
        if (k_base + 2 < K) {
            float4 b2 = convert_float4(read_imageh(B_img, SAMPLER, (int2)(n_4_idx, k_base + 2)));
            c_vals = mad(a_vals.z, b2, c_vals);
        }
        if (k_base + 3 < K) {
            float4 b3 = convert_float4(read_imageh(B_img, SAMPLER, (int2)(n_4_idx, k_base + 3)));
            c_vals = mad(a_vals.w, b3, c_vals);
        }
    }

    __global float* C = (__global float*)((__global char*)C_buf + c_offset);

    // Scatter the 4 accumulated columns; the partial tail (N not a multiple
    // of 4) writes only the in-range lanes.
    if (n_base + 3 < N) {
        C[(n_base + 0) * M + m_idx] = c_vals.x;
        C[(n_base + 1) * M + m_idx] = c_vals.y;
        C[(n_base + 2) * M + m_idx] = c_vals.z;
        C[(n_base + 3) * M + m_idx] = c_vals.w;
    } else {
        if (n_base < N) C[n_base * M + m_idx] = c_vals.x;
        if (n_base + 1 < N) C[(n_base + 1) * M + m_idx] = c_vals.y;
        if (n_base + 2 < N) C[(n_base + 2) * M + m_idx] = c_vals.z;
    }
}
29 changes: 29 additions & 0 deletions ggml/src/ggml-opencl/kernels/pack_a_for_image.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Pack the f16 matrix A (M rows x K cols, row-major) into a 2D image of
// RGBA/half texels: texel (k4, m) receives A[m][4*k4 .. 4*k4+3]. Components
// past the end of a row (when K is not a multiple of 4) are zero-filled so
// they contribute nothing to the dot product.
__kernel void pack_a_for_image(
    __global const half* src_a,
    const ulong a_offset,
    __write_only image2d_t dest_img,
    const int M,
    const int K
) {
    const int tex_x = get_global_id(0);   // texel column == k / 4
    const int row   = get_global_id(1);   // matrix row m

    const int k0 = tex_x * 4;
    if (row >= M || k0 >= K) {
        return;
    }

    __global const half* A = (__global const half*)((__global const char*)src_a + a_offset);
    const int base = row * K + k0;

    // Lane 0 is always in range; the tail lanes keep the zero padding.
    half4 texel = (half4)(0.0h);
    texel.x = A[base];
    if (k0 + 1 < K) texel.y = A[base + 1];
    if (k0 + 2 < K) texel.z = A[base + 2];
    if (k0 + 3 < K) texel.w = A[base + 3];

    write_imageh(dest_img, (int2)(tex_x, row), texel);
}
28 changes: 28 additions & 0 deletions ggml/src/ggml-opencl/kernels/pack_b_for_image.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Pack the f32 matrix B into a 2D image of RGBA/half texels. The source
// buffer indexing (src_b[n * K + k]) shows element (k, n) is stored K-major;
// texel (n4, k) gathers B[k][4*n4 .. 4*n4+3] converted to half. Components
// past N (when N is not a multiple of 4) are zero-filled.
__kernel void pack_b_for_image(
    __global const float* src_b,
    const ulong b_offset,
    __write_only image2d_t dest_img,
    const int K,
    const int N
) {
    const int tex_x = get_global_id(0);   // texel column == n / 4
    const int k     = get_global_id(1);   // k index == image row

    const int n0 = tex_x * 4;
    if (k >= K || n0 >= N) {
        return;
    }

    __global const float* B = (__global const float*)((__global const char*)src_b + b_offset);

    // Lane 0 is always in range; the tail lanes keep the zero padding.
    half4 texel = (half4)(0.0h);
    texel.x = convert_half(B[n0 * K + k]);
    if (n0 + 1 < N) texel.y = convert_half(B[(n0 + 1) * K + k]);
    if (n0 + 2 < N) texel.z = convert_half(B[(n0 + 2) * K + k]);
    if (n0 + 3 < N) texel.w = convert_half(B[(n0 + 3) * K + k]);

    write_imageh(dest_img, (int2)(tex_x, k), texel);
}
Loading