From f5876f2d5c8a5b953d1744fc44af374acf98f436 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sun, 22 Jun 2025 02:57:20 +0800 Subject: [PATCH 1/8] Conv2D: Add CPU version --- ggml/include/ggml.h | 12 +++ ggml/src/ggml-cpu/ggml-cpu.c | 5 ++ ggml/src/ggml-cpu/ops.cpp | 157 +++++++++++++++++++++++++++++++++++ ggml/src/ggml-cpu/ops.h | 1 + ggml/src/ggml.c | 39 +++++++++ 5 files changed, 214 insertions(+) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index e5dda969a38fe..8bd3ca929285c 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -482,6 +482,7 @@ extern "C" { GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_IM2COL, GGML_OP_IM2COL_BACK, + GGML_OP_CONV_2D, GGML_OP_CONV_2D_DW, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, @@ -1813,6 +1814,17 @@ extern "C" { struct ggml_tensor * b, int stride); + GGML_API struct ggml_tensor * ggml_conv_2d_direct( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC] + struct ggml_tensor * b, // input data [W, H, C, N] + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1); // dilation dimension 1 + enum ggml_op_pool { GGML_OP_POOL_MAX, GGML_OP_POOL_AVG, diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 1d68cde71a65e..75d830f62b7e1 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1866,6 +1866,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_im2col_back_f32(params, tensor); } break; + case GGML_OP_CONV_2D: + { + ggml_compute_forward_conv_2d(params, tensor); + } break; case GGML_OP_CONV_2D_DW: { ggml_compute_forward_conv_2d_dw(params, tensor); @@ -2228,6 +2232,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_IM2COL: case GGML_OP_IM2COL_BACK: + case GGML_OP_CONV_2D: case GGML_OP_CONV_2D_DW: case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_CONV_TRANSPOSE_2D: diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 27586ed1fdb2c..fdc7990d9d153 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -6545,6 +6545,163 @@ void ggml_compute_forward_im2col_back_f32( } } +// ggml_compute_forward_conv_2d + +static void ggml_compute_forward_conv_2d_f32( + const ggml_compute_params * params, + const ggml_tensor * kernel, // [KW, KH, IC, OC] + const ggml_tensor * src, // [W, H, C, N] + ggml_tensor * dst) { // [OW, OH, OC, N] + + const int32_t s0 = ggml_get_op_params_i32(dst, 0); + const int32_t s1 = ggml_get_op_params_i32(dst, 1); + const int32_t p0 = ggml_get_op_params_i32(dst, 2); + const int32_t p1 = ggml_get_op_params_i32(dst, 3); + const int32_t d0 = ggml_get_op_params_i32(dst, 4); + const int32_t d1 = ggml_get_op_params_i32(dst, 5); + + const int64_t OW = dst->ne[0]; + const int64_t OH = dst->ne[1]; + const int64_t OC = dst->ne[2]; + const int64_t N = dst->ne[3]; + + const int64_t IW = src->ne[0]; + const int64_t IH = src->ne[1]; + const int64_t IC = src->ne[2]; + + const int64_t KW = kernel->ne[0]; + const int64_t KH = kernel->ne[1]; + + const float * kernel_data = (const float *)kernel->data; + const float * src_data = (const float *)src->data; + float * dst_data = (float *)dst->data; + + const int64_t rows_total = OH * N; + const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth; + const int64_t row_start = params->ith * rows_per_thread; + const int64_t row_end = MIN(row_start + rows_per_thread, rows_total); + + for (int64_t row = row_start; row < row_end; ++row) { + const int64_t oh = row % OH; + const int64_t n = row / OH; + const float * src_batch = src_data + n * IW * IH * IC; + + for (int64_t ow = 0; ow < OW; ++ow) { + for (int64_t oc = 0; oc < OC; ++oc) { + float sum = 0.0f; + const float * kernel_channel = kernel_data + oc * KW * KH * IC; + + for (int64_t kh = 0; kh < KH; ++kh) { + const int64_t ih = oh * s1 - p1 + kh * d1; + if (ih < 0 || ih >= IH) continue; + + for (int64_t kw = 0; kw < KW; ++kw) { + const int64_t iw = ow * s0 - p0 + kw * d0; + if (iw < 0 || iw >= IW) continue; + + #pragma omp simd + for (int64_t ic = 0; ic < IC; ++ic) { + const float * kernel_ptr = kernel_channel + (kh * KW + kw) + ic * KW * KH; + const float * src_ptr = src_batch + (ih * IW + iw) + ic * IW * IH; + sum += (*kernel_ptr) * (*src_ptr); + } + } + } + + dst_data[((n * OC + oc) * OH + oh) * OW + ow] = sum; + } + } + } +} + +static void ggml_compute_forward_conv_2d_f16( + const ggml_compute_params * params, + const ggml_tensor * kernel, // [KW, KH, IC, OC] + const ggml_tensor * src, // [W, H, C, N] + ggml_tensor * dst) { // [OW, OH, OC, N] + + const int32_t s0 = ggml_get_op_params_i32(dst, 0); + const int32_t s1 = ggml_get_op_params_i32(dst, 1); + const int32_t p0 = ggml_get_op_params_i32(dst, 2); + const int32_t p1 = ggml_get_op_params_i32(dst, 3); + const int32_t d0 = ggml_get_op_params_i32(dst, 4); + const int32_t d1 = ggml_get_op_params_i32(dst, 5); + + const int64_t OW = dst->ne[0]; + const int64_t OH = dst->ne[1]; + const int64_t OC = dst->ne[2]; + const int64_t N = dst->ne[3]; + + const int64_t IW = src->ne[0]; + const int64_t IH = src->ne[1]; + const int64_t IC = src->ne[2]; + + const int64_t KW = kernel->ne[0]; + const int64_t KH = kernel->ne[1]; + + const ggml_fp16_t * kernel_data = (const ggml_fp16_t *)kernel->data; + const ggml_fp16_t * src_data = (const ggml_fp16_t *)src->data; + ggml_fp16_t * dst_data = (ggml_fp16_t *)dst->data; + + const int64_t rows_total = OH * N; + const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth; + const int64_t row_start = params->ith * rows_per_thread; + const int64_t row_end = MIN(row_start + rows_per_thread, rows_total); + + for (int64_t row = row_start; row < row_end; ++row) { + const int64_t oh = row % OH; + const int64_t n = row / OH; + const ggml_fp16_t * src_batch = src_data + n * IW * IH * IC; + + for (int64_t ow = 0; ow < OW; ++ow) { + for (int64_t oc = 0; oc < OC; ++oc) { + float sum = 0.0f; + const ggml_fp16_t * kernel_channel = kernel_data + oc * KW * KH * IC; + for (int64_t kh = 0; kh < KH; ++kh) { + const int64_t ih = oh * s1 - p1 + kh * d1; + if (ih < 0 || ih >= IH) continue; + + for (int64_t kw = 0; kw < KW; ++kw) { + const int64_t iw = ow * s0 - p0 + kw * d0; + if (iw < 0 || iw >= IW) continue; + + for (int64_t ic = 0; ic < IC; ++ic) { + const ggml_fp16_t * kernel_ptr = kernel_channel + (kh * KW + kw) + ic * KW * KH; + const ggml_fp16_t * src_ptr = src_batch + (ih * IW + iw) + ic * IW * IH; + sum += GGML_FP16_TO_FP32(*kernel_ptr) * GGML_FP16_TO_FP32(*src_ptr); + } + } + } + + dst_data[((n * OC + oc) * OH + oh) * OW + ow] = GGML_FP32_TO_FP16(sum); + } + } + } +} + +void ggml_compute_forward_conv_2d( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_2d_f16(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_conv_2d_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_conv_transpose_2d void ggml_compute_forward_conv_transpose_2d( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 5b384e4ba5fce..8d19fc925c2a7 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -65,6 +65,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 14000b55aca1e..172a060490f6c 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1044,6 +1044,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "conv_transpose_1d(x)", "im2col(x)", "im2col_back(x)", + "conv_2d(x)", "conv_2d_dw(x)", "conv_transpose_2d(x)", "pool_1d(x)", @@ -4291,6 +4292,44 @@ struct ggml_tensor * ggml_conv_2d_dw_direct( return result; } +// ggml_conv_2d_direct + +struct ggml_tensor * ggml_conv_2d_direct( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC] + struct ggml_tensor * b, // input data [W, H, C, N] + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1) {// dilation dimension 1 + + GGML_ASSERT(a->ne[2] == b->ne[2]); + GGML_ASSERT(a->type == b->type); + + int64_t ne[4]; + ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1); + ne[2] = a->ne[3]; + ne[3] = b->ne[3]; + + struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne); + + ggml_set_op_params_i32(result, 0, s0); + ggml_set_op_params_i32(result, 1, s1); + ggml_set_op_params_i32(result, 2, p0); + ggml_set_op_params_i32(result, 3, p1); + ggml_set_op_params_i32(result, 4, d0); + ggml_set_op_params_i32(result, 5, d1); + + result->op = GGML_OP_CONV_2D; + result->src[0] = a; + result->src[1] = b; + + return result; +} + // ggml_conv_transpose_2d_p0 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) { From 48b7fa2fe669aefec685956e63544df4dd1823e4 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 26 Jun 2025 14:53:10 +0800 Subject: [PATCH 2/8] Half decent --- ggml/src/ggml-cpu/ggml-cpu.c | 12 ++- ggml/src/ggml-cpu/ops.cpp | 200 ++++++++++++++++++++++++++--------- ggml/src/ggml-cpu/ops.h | 4 + ggml/src/ggml.c | 2 +- 4 files changed, 168 insertions(+), 50 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 75d830f62b7e1..edf8222843220 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -687,6 +687,10 @@ static void ggml_init_arm_arch_features(void) { #endif // __ARM_ARCH +void ggml_compute_forward_mul_mat( + const struct ggml_compute_params * params, + struct ggml_tensor * dst); + struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { GGML_ASSERT(!ggml_get_no_alloc(ctx)); @@ -1193,7 +1197,7 @@ static void ggml_compute_forward_mul_mat_one_chunk( } } -static void ggml_compute_forward_mul_mat( +void ggml_compute_forward_mul_mat( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -2751,6 +2755,12 @@ struct ggml_cplan ggml_graph_plan( GGML_ABORT("fatal error"); } } break; + case GGML_OP_CONV_2D: + { + cur = GGML_IM2COL_WORK_SIZE; + //Add enough space for kernel transpose + cur += sizeof(ggml_fp16_t)*node->src[1]->ne[0]*node->src[1]->ne[1]*node->src[1]->ne[2]*node->src[1]->ne[3]; + } break; case GGML_OP_CONV_TRANSPOSE_2D: { const int64_t ne00 = node->src[0]->ne[0]; // W diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index fdc7990d9d153..0d82690fc9703 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -3,6 +3,7 @@ #include "ggml-cpu.h" #include "ggml-impl.h" #include "binary-ops.h" +#include "ggml.h" #include "unary-ops.h" #include "vec.h" @@ -6545,70 +6546,173 @@ void ggml_compute_forward_im2col_back_f32( } } +static void ggml_call_mul_mat( + const ggml_compute_params * params, + int64_t m, int64_t n, int64_t k, + void * a, void * b, void * c) { + + struct ggml_tensor src1 = {}; + src1.ne[0] = k; + src1.ne[1] = m; + src1.ne[2] = 1; + src1.ne[3] = 1; + src1.nb[0] = sizeof(float); + src1.nb[1] = k * sizeof(float); + src1.nb[2] = src1.nb[1]; + src1.nb[3] = src1.nb[2]; + src1.data = a; + + struct ggml_tensor src0 = {}; + src0.ne[0] = k; + src0.ne[1] = n; + src0.ne[2] = 1; + src0.ne[3] = 1; + src0.nb[0] = sizeof(float); + src0.nb[1] = k * sizeof(float); + src0.nb[2] = src0.nb[1]; + src0.nb[3] = src0.nb[2]; + src0.data = b; + + struct ggml_tensor dst = {}; + dst.ne[0] = n; + dst.ne[1] = m; + dst.ne[2] = 1; + dst.ne[3] = 1; + dst.nb[0] = sizeof(float); + dst.nb[1] = n * sizeof(float); + dst.nb[2] = dst.nb[1]; + dst.nb[3] = dst.nb[2]; + dst.data = c; + dst.src[0] = &src0; + dst.src[1] = &src1; + + ggml_compute_forward_mul_mat(params, &dst); +} + + // ggml_compute_forward_conv_2d -static void ggml_compute_forward_conv_2d_f32( - const ggml_compute_params * params, - const ggml_tensor * kernel, // [KW, KH, IC, OC] - const ggml_tensor * src, // [W, H, C, N] - ggml_tensor * dst) { // [OW, OH, OC, N] +static void ggml_compute_forward_conv_2d_f32(const ggml_compute_params * params, + ggml_tensor * dst) { - const int32_t s0 = ggml_get_op_params_i32(dst, 0); - const int32_t s1 = ggml_get_op_params_i32(dst, 1); - const int32_t p0 = ggml_get_op_params_i32(dst, 2); - const int32_t p1 = ggml_get_op_params_i32(dst, 3); - const int32_t d0 = ggml_get_op_params_i32(dst, 4); - const int32_t d1 = ggml_get_op_params_i32(dst, 5); + const ggml_tensor * src = dst->src[1]; // [W H C_in N] + const ggml_tensor * kernel = dst->src[0]; // [W H C_in C_out] - const int64_t OW = dst->ne[0]; - const int64_t OH = dst->ne[1]; - const int64_t OC = dst->ne[2]; - const int64_t N = dst->ne[3]; + GGML_ASSERT(ggml_is_contiguous(kernel)); - const int64_t IW = src->ne[0]; - const int64_t IH = src->ne[1]; - const int64_t IC = src->ne[2]; + const int32_t stride_x = dst->op_params[0]; + const int32_t stride_y = dst->op_params[1]; + const int32_t pad_x = dst->op_params[2]; + const int32_t pad_y = dst->op_params[3]; - const int64_t KW = kernel->ne[0]; - const int64_t KH = kernel->ne[1]; + const int64_t c_in = src->ne[2]; + const int64_t c_out = kernel->ne[3]; + GGML_ASSERT(c_in == kernel->ne[2]); - const float * kernel_data = (const float *)kernel->data; - const float * src_data = (const float *)src->data; - float * dst_data = (float *)dst->data; + const int64_t src_w = src->ne[0]; + const int64_t src_h = src->ne[1]; + const int64_t knl_w = kernel->ne[0]; + const int64_t knl_h = kernel->ne[1]; + const int64_t dst_w = dst->ne[0]; + const int64_t dst_h = dst->ne[1]; - const int64_t rows_total = OH * N; - const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth; - const int64_t row_start = params->ith * rows_per_thread; - const int64_t row_end = MIN(row_start + rows_per_thread, rows_total); - for (int64_t row = row_start; row < row_end; ++row) { - const int64_t oh = row % OH; - const int64_t n = row / OH; - const float * src_batch = src_data + n * IW * IH * IC; + float * src_data = (float *) src->data; + float * knl_data = (float *) kernel->data; + float * dst_data = ( float *) dst->data; - for (int64_t ow = 0; ow < OW; ++ow) { - for (int64_t oc = 0; oc < OC; ++oc) { - float sum = 0.0f; - const float * kernel_channel = kernel_data + oc * KW * KH * IC; - for (int64_t kh = 0; kh < KH; ++kh) { - const int64_t ih = oh * s1 - p1 + kh * d1; - if (ih < 0 || ih >= IH) continue; + const int64_t knl_n = knl_w * knl_h * c_in; + const int64_t patch_total = dst->ne[3] * dst_w * dst_h; + - for (int64_t kw = 0; kw < KW; ++kw) { - const int64_t iw = ow * s0 - p0 + kw * d0; - if (iw < 0 || iw >= IW) continue; + + const int64_t space_per_patch = knl_n * sizeof(float) + patch_total * c_out * sizeof(float); - #pragma omp simd - for (int64_t ic = 0; ic < IC; ++ic) { - const float * kernel_ptr = kernel_channel + (kh * KW + kw) + ic * KW * KH; - const float * src_ptr = src_batch + (ih * IW + iw) + ic * IW * IH; - sum += (*kernel_ptr) * (*src_ptr); + const int64_t batch_size = params->wsize / space_per_patch; + const int64_t patches_per_batch = batch_size > 8 ? (batch_size / 8) * 8 : batch_size; + const int64_t batch_n = (patch_total + patches_per_batch - 1) / patches_per_batch; + + + GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1); + + float * tmp = (float *) params->wdata; // per-thread scratch + + for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) { + + const int64_t patch_start_batch = batch_i * patches_per_batch; + const int64_t patch_end_batch = std::min(patch_start_batch + patches_per_batch, + patch_total); + const int64_t patch_n = patch_end_batch - patch_start_batch; + + const int64_t patch_per_thread = + (patch_n + params->nth - 1) / params->nth; + const int64_t patch_start = patch_start_batch + + params->ith * patch_per_thread; + const int64_t patch_end = std::min(patch_start + patch_per_thread, + patch_end_batch); + + //im2col for a patch + for (int64_t p = patch_start; p < patch_end; ++p) { + const int64_t b = p / (dst_w * dst_h); + const int64_t dy = (p / dst_w) % dst_h; + const int64_t dx = p % dst_w; + + const float * src_base = (const float *)((char *)src_data + b * src->nb[3]); + float * out_row = tmp + (p % patches_per_batch) * knl_n; + + // Extract patch in IC,KH,KW order (same as im2col) + for (int64_t ic = 0; ic < c_in; ++ic) { + for (int64_t ky = 0; ky < knl_h; ++ky) { + for (int64_t kx = 0; kx < knl_w; ++kx) { + const int64_t sy = dy * stride_y + ky - pad_y; + const int64_t sx = dx * stride_x + kx - pad_x; + + int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx; + + if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) { + out_row[dst_idx] = 0.0f; + } else { + float * src_ptr = (float *)((char *)src_base + + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]); + out_row[dst_idx] = *src_ptr; } } } + } + } // patches handled by this thread + + ggml_barrier(params->threadpool); // wait for all threads - dst_data[((n * OC + oc) * OH + oh) * OW + ow] = sum; + //GEMM output is patch_n * cout + float * gemm_output = tmp + patches_per_batch * knl_n; + + // GEMM: patches[patch_n, knl_n] × kernel[knl_n, c_out] = output[patch_n, c_out] + ggml_call_mul_mat(params, patch_n, c_out, knl_n, + tmp, knl_data, gemm_output); + + // Barrier to ensure GEMM completes before permutation + ggml_barrier(params->threadpool); + + // Distribute permutation work across threads + const int64_t permute_per_thread = (patch_n + params->nth - 1) / params->nth; + const int64_t permute_start = params->ith * permute_per_thread; + const int64_t permute_end = std::min(permute_start + permute_per_thread, patch_n); + + // Each thread handles part of the permutation from [patch_n, c_out] to WHCN layout + for (int64_t i = permute_start; i < permute_end; ++i) { + const int64_t p = patch_start_batch + i; + const int64_t b = p / (dst_w * dst_h); // batch index + const int64_t dy = (p / dst_w) % dst_h; // height index + const int64_t dx = p % dst_w; // width index + + // Copy all channels for this spatial position + for (int64_t oc = 0; oc < c_out; ++oc) { + const float value = gemm_output[i * c_out + oc]; + // Write to WHCN layout: dst[w, h, c, n] + float * dst_ptr = (float *)((char *)dst_data + + dx * dst->nb[0] + dy * dst->nb[1] + oc * dst->nb[2] + b * dst->nb[3]); + *dst_ptr = value; } } } @@ -6693,7 +6797,7 @@ void ggml_compute_forward_conv_2d( } break; case GGML_TYPE_F32: { - ggml_compute_forward_conv_2d_f32(params, src0, src1, dst); + ggml_compute_forward_conv_2d_f32(params, dst); } break; default: { diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 8d19fc925c2a7..417f49074f4ed 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -20,6 +20,9 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); +// Work buffer size for im2col operations in CONV2D +#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024) // 16MB work buffer + #ifdef __cplusplus extern "C" { #endif @@ -108,6 +111,7 @@ void ggml_compute_forward_custom(const struct ggml_compute_params * params, stru void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst); #ifdef __cplusplus } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 172a060490f6c..e4bdd6efa6aad 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4306,7 +4306,7 @@ struct ggml_tensor * ggml_conv_2d_direct( int d1) {// dilation dimension 1 GGML_ASSERT(a->ne[2] == b->ne[2]); - GGML_ASSERT(a->type == b->type); + //GGML_ASSERT(a->type == b->type); int64_t ne[4]; ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); From a2c0311038dd4e5a4785c680283463a37631dd9c Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 26 Jun 2025 17:50:25 +0800 Subject: [PATCH 3/8] Tiled approach for F32 --- ggml/src/ggml-cpu/ops.cpp | 190 +++++++++++--------------------------- tests/CMakeLists.txt | 1 + 2 files changed, 53 insertions(+), 138 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 0d82690fc9703..31310914a6799 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -6592,18 +6592,21 @@ static void ggml_call_mul_mat( // ggml_compute_forward_conv_2d -static void ggml_compute_forward_conv_2d_f32(const ggml_compute_params * params, - ggml_tensor * dst) { - - const ggml_tensor * src = dst->src[1]; // [W H C_in N] - const ggml_tensor * kernel = dst->src[0]; // [W H C_in C_out] +static void ggml_compute_forward_conv_2d_f32( + const ggml_compute_params * params, + const ggml_tensor * kernel, // [KW, KH, IC, OC] - fp32 + const ggml_tensor * src, // [W, H, C, N] + ggml_tensor * dst) { // [OW, OH, OC, N] GGML_ASSERT(ggml_is_contiguous(kernel)); + GGML_ASSERT(kernel->type == GGML_TYPE_F32); - const int32_t stride_x = dst->op_params[0]; - const int32_t stride_y = dst->op_params[1]; - const int32_t pad_x = dst->op_params[2]; - const int32_t pad_y = dst->op_params[3]; + const int32_t stride_x = dst->op_params[0]; + const int32_t stride_y = dst->op_params[1]; + const int32_t pad_x = dst->op_params[2]; + const int32_t pad_y = dst->op_params[3]; + const int32_t dilation_x = dst->op_params[4]; + const int32_t dilation_y = dst->op_params[5]; const int64_t c_in = src->ne[2]; const int64_t c_out = kernel->ne[3]; @@ -6616,173 +6619,93 @@ static void ggml_compute_forward_conv_2d_f32(const ggml_compute_params * params, const int64_t dst_w = dst->ne[0]; const int64_t dst_h = dst->ne[1]; - - float * src_data = (float *) src->data; - float * knl_data = (float *) kernel->data; - float * dst_data = ( float *) dst->data; - + float * src_data = (float*) src->data; + float * knl_data = (float*) kernel->data; + float * dst_data = (float*) dst->data; const int64_t knl_n = knl_w * knl_h * c_in; const int64_t patch_total = dst->ne[3] * dst_w * dst_h; - - - - const int64_t space_per_patch = knl_n * sizeof(float) + patch_total * c_out * sizeof(float); - const int64_t batch_size = params->wsize / space_per_patch; + const int64_t space_per_patch = knl_n * sizeof(float) + c_out * sizeof(float); + const int64_t batch_size = params->wsize / space_per_patch; const int64_t patches_per_batch = batch_size > 8 ? (batch_size / 8) * 8 : batch_size; - const int64_t batch_n = (patch_total + patches_per_batch - 1) / patches_per_batch; - + const int64_t batch_n = (patch_total + patches_per_batch - 1) / patches_per_batch; GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1); - float * tmp = (float *) params->wdata; // per-thread scratch + float * tmp = (float *) params->wdata; for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) { const int64_t patch_start_batch = batch_i * patches_per_batch; const int64_t patch_end_batch = std::min(patch_start_batch + patches_per_batch, patch_total); - const int64_t patch_n = patch_end_batch - patch_start_batch; + const int64_t patch_n = patch_end_batch - patch_start_batch; - const int64_t patch_per_thread = - (patch_n + params->nth - 1) / params->nth; - const int64_t patch_start = patch_start_batch + - params->ith * patch_per_thread; - const int64_t patch_end = std::min(patch_start + patch_per_thread, - patch_end_batch); + const int64_t patch_per_thread = (patch_n + params->nth - 1) / params->nth; + const int64_t patch_start = patch_start_batch + params->ith * patch_per_thread; + const int64_t patch_end = std::min(patch_start + patch_per_thread,patch_end_batch); //im2col for a patch for (int64_t p = patch_start; p < patch_end; ++p) { - const int64_t b = p / (dst_w * dst_h); - const int64_t dy = (p / dst_w) % dst_h; - const int64_t dx = p % dst_w; + const int64_t batch_n = p / (dst_w * dst_h); + const int64_t src_x = (p / dst_w) % dst_h; + const int64_t src_y = p % dst_w; - const float * src_base = (const float *)((char *)src_data + b * src->nb[3]); - float * out_row = tmp + (p % patches_per_batch) * knl_n; + float * src_base = (float *)((char *)src_data + batch_n * src->nb[3]); + float * dst_row = tmp + (p % patches_per_batch) * knl_n; - // Extract patch in IC,KH,KW order (same as im2col) for (int64_t ic = 0; ic < c_in; ++ic) { for (int64_t ky = 0; ky < knl_h; ++ky) { for (int64_t kx = 0; kx < knl_w; ++kx) { - const int64_t sy = dy * stride_y + ky - pad_y; - const int64_t sx = dx * stride_x + kx - pad_x; - + const int64_t sy = src_x * stride_y + ky * dilation_y - pad_y; + const int64_t sx = src_y * stride_x + kx * dilation_x - pad_x; + int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx; - + if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) { - out_row[dst_idx] = 0.0f; + dst_row[dst_idx] = 0.0f; } else { - float * src_ptr = (float *)((char *)src_base + - sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]); - out_row[dst_idx] = *src_ptr; + float * src_ptr = (float *)((char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]); + dst_row[dst_idx] = *src_ptr; } } } } } // patches handled by this thread - ggml_barrier(params->threadpool); // wait for all threads + ggml_barrier(params->threadpool); - //GEMM output is patch_n * cout float * gemm_output = tmp + patches_per_batch * knl_n; - + // GEMM: patches[patch_n, knl_n] × kernel[knl_n, c_out] = output[patch_n, c_out] ggml_call_mul_mat(params, patch_n, c_out, knl_n, tmp, knl_data, gemm_output); - - // Barrier to ensure GEMM completes before permutation + ggml_barrier(params->threadpool); - - // Distribute permutation work across threads + + + //permute back [OC, N, OH, OW] to [N, OC, OH, OW] const int64_t permute_per_thread = (patch_n + params->nth - 1) / params->nth; const int64_t permute_start = params->ith * permute_per_thread; const int64_t permute_end = std::min(permute_start + permute_per_thread, patch_n); - - // Each thread handles part of the permutation from [patch_n, c_out] to WHCN layout + for (int64_t i = permute_start; i < permute_end; ++i) { - const int64_t p = patch_start_batch + i; - const int64_t b = p / (dst_w * dst_h); // batch index - const int64_t dy = (p / dst_w) % dst_h; // height index - const int64_t dx = p % dst_w; // width index - - // Copy all channels for this spatial position + const int64_t p = patch_start_batch + i; + const int64_t batch_n = p / (dst_w * dst_h); + const int64_t dst_y = (p / dst_w) % dst_h; + const int64_t dst_x = p % dst_w; + for (int64_t oc = 0; oc < c_out; ++oc) { const float value = gemm_output[i * c_out + oc]; // Write to WHCN layout: dst[w, h, c, n] - float * dst_ptr = (float *)((char *)dst_data + - dx * dst->nb[0] + dy * dst->nb[1] + oc * dst->nb[2] + b * dst->nb[3]); + float * dst_ptr = (float *)((char *)dst_data + dst_x * dst->nb[0] + dst_y * dst->nb[1] + oc * dst->nb[2] + batch_n * dst->nb[3]); *dst_ptr = value; } } } } -static void ggml_compute_forward_conv_2d_f16( - const ggml_compute_params * params, - const ggml_tensor * kernel, // [KW, KH, IC, OC] - const ggml_tensor * src, // [W, H, C, N] - ggml_tensor * dst) { // [OW, OH, OC, N] - - const int32_t s0 = ggml_get_op_params_i32(dst, 0); - const int32_t s1 = ggml_get_op_params_i32(dst, 1); - const int32_t p0 = ggml_get_op_params_i32(dst, 2); - const int32_t p1 = ggml_get_op_params_i32(dst, 3); - const int32_t d0 = ggml_get_op_params_i32(dst, 4); - const int32_t d1 = ggml_get_op_params_i32(dst, 5); - - const int64_t OW = dst->ne[0]; - const int64_t OH = dst->ne[1]; - const int64_t OC = dst->ne[2]; - const int64_t N = dst->ne[3]; - - const int64_t IW = src->ne[0]; - const int64_t IH = src->ne[1]; - const int64_t IC = src->ne[2]; - - const int64_t KW = kernel->ne[0]; - const int64_t KH = kernel->ne[1]; - - const ggml_fp16_t * kernel_data = (const ggml_fp16_t *)kernel->data; - const ggml_fp16_t * src_data = (const ggml_fp16_t *)src->data; - ggml_fp16_t * dst_data = (ggml_fp16_t *)dst->data; - - const int64_t rows_total = OH * N; - const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth; - const int64_t row_start = params->ith * rows_per_thread; - const int64_t row_end = MIN(row_start + rows_per_thread, rows_total); - - for (int64_t row = row_start; row < row_end; ++row) { - const int64_t oh = row % OH; - const int64_t n = row / OH; - const ggml_fp16_t * src_batch = src_data + n * IW * IH * IC; - - for (int64_t ow = 0; ow < OW; ++ow) { - for (int64_t oc = 0; oc < OC; ++oc) { - float sum = 0.0f; - const ggml_fp16_t * kernel_channel = kernel_data + oc * KW * KH * IC; - for (int64_t kh = 0; kh < KH; ++kh) { - const int64_t ih = oh * s1 - p1 + kh * d1; - if (ih < 0 || ih >= IH) continue; - - for (int64_t kw = 0; kw < KW; ++kw) { - const int64_t iw = ow * s0 - p0 + kw * d0; - if (iw < 0 || iw >= IW) continue; - - for (int64_t ic = 0; ic < IC; ++ic) { - const ggml_fp16_t * kernel_ptr = kernel_channel + (kh * KW + kw) + ic * KW * KH; - const ggml_fp16_t * src_ptr = src_batch + (ih * IW + iw) + ic * IW * IH; - sum += GGML_FP16_TO_FP32(*kernel_ptr) * GGML_FP16_TO_FP32(*src_ptr); - } - } - } - - dst_data[((n * OC + oc) * OH + oh) * OW + ow] = GGML_FP32_TO_FP16(sum); - } - } - } -} - void ggml_compute_forward_conv_2d( const ggml_compute_params * params, ggml_tensor * dst) { @@ -6790,19 +6713,10 @@ void ggml_compute_forward_conv_2d( const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - switch (src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_conv_2d_f16(params, src0, src1, dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_conv_2d_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } + if (src0->type == GGML_TYPE_F16) { + GGML_ASSERT(false && "F16 not supported yet"); + } else { + ggml_compute_forward_conv_2d_f32(params, src0, src1, dst); } } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index fc1557a2d4065..517cc7a945e25 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -195,6 +195,7 @@ endif() # llama_build_and_test(test-opt.cpp) # SLOW llama_build_and_test(test-gguf.cpp) llama_build_and_test(test-backend-ops.cpp) +llama_build_and_test(test_conv2d_comparison.cpp) llama_build_and_test(test-model-load-cancel.cpp LABEL "model") llama_build_and_test(test-autorelease.cpp LABEL "model") From 4b1e71a98864f29b3cc3b87d99cf847efc6e8374 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 26 Jun 2025 17:54:12 +0800 Subject: [PATCH 4/8] remove file --- ggml/src/ggml-cpu/ggml-cpu.c | 2 -- tests/CMakeLists.txt | 1 - 2 files changed, 3 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index edf8222843220..219e87770b65e 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2758,8 +2758,6 @@ struct ggml_cplan ggml_graph_plan( case GGML_OP_CONV_2D: { cur = GGML_IM2COL_WORK_SIZE; - //Add enough space for kernel transpose - cur += sizeof(ggml_fp16_t)*node->src[1]->ne[0]*node->src[1]->ne[1]*node->src[1]->ne[2]*node->src[1]->ne[3]; } break; case GGML_OP_CONV_TRANSPOSE_2D: { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 517cc7a945e25..fc1557a2d4065 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -195,7 +195,6 @@ endif() # llama_build_and_test(test-opt.cpp) # SLOW llama_build_and_test(test-gguf.cpp) llama_build_and_test(test-backend-ops.cpp) -llama_build_and_test(test_conv2d_comparison.cpp) llama_build_and_test(test-model-load-cancel.cpp LABEL "model") llama_build_and_test(test-autorelease.cpp LABEL "model") From aed4e1ffab02de6a831af422da6d66f63f3b24cb Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 26 Jun 2025 22:59:58 +0800 Subject: [PATCH 5/8] Fix tests --- ggml/src/ggml-cpu/ggml-cpu.c | 4 ---- ggml/src/ggml-cpu/ops.h | 2 +- ggml/src/ggml.c | 1 + 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 219e87770b65e..11ff228f07a44 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -687,10 +687,6 @@ static void ggml_init_arm_arch_features(void) { #endif // __ARM_ARCH -void ggml_compute_forward_mul_mat( - const struct ggml_compute_params * params, - struct ggml_tensor * dst); - struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { GGML_ASSERT(!ggml_get_no_alloc(ctx)); diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 417f49074f4ed..3a32ec20dba2b 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -21,7 +21,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); // Work buffer size for im2col operations in CONV2D -#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024) // 16MB work buffer +#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024) #ifdef __cplusplus extern "C" { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index e4bdd6efa6aad..7d407315fea78 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -945,6 +945,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CONV_TRANSPOSE_1D", "IM2COL", "IM2COL_BACK", + "CONV_2D", "CONV_2D_DW", "CONV_TRANSPOSE_2D", "POOL_1D", From 966aa76c1c24afbfd308a04265d8a75b2e894912 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sat, 28 Jun 2025 17:10:23 +0800 Subject: [PATCH 6/8] Support F16 operations --- ggml/src/ggml-cpu/ops.cpp | 65 ++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 31310914a6799..5839945b405a3 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -6546,29 +6546,29 @@ void ggml_compute_forward_im2col_back_f32( } } -static void ggml_call_mul_mat( - const ggml_compute_params * params, - int64_t m, int64_t n, int64_t k, - void * a, void * b, void * c) { - +static void ggml_call_mul_mat(ggml_type T, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k, + void * a, void * b, void * c) { + const ggml_type_traits * traits = ggml_get_type_traits(T); struct ggml_tensor src1 = {}; + src1.type = T; src1.ne[0] = k; src1.ne[1] = m; src1.ne[2] = 1; src1.ne[3] = 1; - src1.nb[0] = sizeof(float); - src1.nb[1] = k * sizeof(float); + src1.nb[0] = traits->type_size; + src1.nb[1] = k * traits->type_size; src1.nb[2] = src1.nb[1]; src1.nb[3] = src1.nb[2]; src1.data = a; struct ggml_tensor src0 = {}; + src0.type = T; src0.ne[0] = k; src0.ne[1] = n; src0.ne[2] = 1; src0.ne[3] = 1; - src0.nb[0] = sizeof(float); - src0.nb[1] = k * sizeof(float); + src0.nb[0] = traits->type_size; + src0.nb[1] = k * traits->type_size; src0.nb[2] = src0.nb[1]; src0.nb[3] = src0.nb[2]; src0.data = b; @@ -6589,17 +6589,18 @@ static void ggml_call_mul_mat( ggml_compute_forward_mul_mat(params, &dst); } - // ggml_compute_forward_conv_2d -static void ggml_compute_forward_conv_2d_f32( - const ggml_compute_params * params, - const ggml_tensor * kernel, // [KW, KH, IC, OC] - fp32 - const ggml_tensor * src, // [W, H, C, N] - ggml_tensor * dst) { // [OW, OH, OC, N] +static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params, + const ggml_tensor * kernel, // [KW, KH, IC, OC] + const ggml_tensor * src, // [W, H, C, N] + ggml_tensor * dst, // [OW, OH, OC, N] + ggml_type kernel_type) { GGML_ASSERT(ggml_is_contiguous(kernel)); - GGML_ASSERT(kernel->type == GGML_TYPE_F32); + GGML_ASSERT(kernel->type == kernel_type); + + const ggml_type_traits * traits = ggml_get_type_traits(kernel_type); const int32_t stride_x = dst->op_params[0]; const int32_t stride_y = dst->op_params[1]; @@ -6620,20 +6621,20 @@ static void ggml_compute_forward_conv_2d_f32( const int64_t dst_h = dst->ne[1]; float * src_data = (float*) src->data; - float * knl_data = (float*) kernel->data; + void * knl_data = kernel->data; float * dst_data = (float*) dst->data; const int64_t knl_n = knl_w * knl_h * c_in; const int64_t patch_total = dst->ne[3] * dst_w * dst_h; - const int64_t space_per_patch = knl_n * sizeof(float) + c_out * sizeof(float); + const int64_t space_per_patch = knl_n * traits->type_size + c_out * sizeof(float); const int64_t batch_size = params->wsize / space_per_patch; const int64_t patches_per_batch = batch_size > 8 ? (batch_size / 8) * 8 : batch_size; const int64_t batch_n = (patch_total + patches_per_batch - 1) / patches_per_batch; GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1); - float * tmp = (float *) params->wdata; + void * tmp = params->wdata; for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) { @@ -6653,7 +6654,7 @@ static void ggml_compute_forward_conv_2d_f32( const int64_t src_y = p % dst_w; float * src_base = (float *)((char *)src_data + batch_n * src->nb[3]); - float * dst_row = tmp + (p % patches_per_batch) * knl_n; + char * dst_row = (char *) tmp + (p % patches_per_batch) * knl_n * traits->type_size; for (int64_t ic = 0; ic < c_in; ++ic) { for (int64_t ky = 0; ky < knl_h; ++ky) { @@ -6663,11 +6664,19 @@ static void ggml_compute_forward_conv_2d_f32( int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx; + float src_val; if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) { - dst_row[dst_idx] = 0.0f; + src_val = 0.0f; } else { float * src_ptr = (float *)((char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]); - dst_row[dst_idx] = *src_ptr; + src_val = *src_ptr; + } + + char * element_ptr = dst_row + dst_idx * traits->type_size; + if (kernel_type == GGML_TYPE_F32) { + *(float *) element_ptr = src_val; + } else if (kernel_type == GGML_TYPE_F16) { + *(ggml_fp16_t *) element_ptr = GGML_FP32_TO_FP16(src_val); } } } @@ -6676,11 +6685,10 @@ static void ggml_compute_forward_conv_2d_f32( ggml_barrier(params->threadpool); - float * gemm_output = tmp + patches_per_batch * knl_n; + float * gemm_output = (float *) ((char *) tmp + patches_per_batch * knl_n * traits->type_size); // GEMM: patches[patch_n, knl_n] × kernel[knl_n, c_out] = output[patch_n, c_out] - ggml_call_mul_mat(params, patch_n, c_out, knl_n, - tmp, knl_data, gemm_output); + ggml_call_mul_mat(kernel_type, params, patch_n, c_out, knl_n, tmp, knl_data, gemm_output); ggml_barrier(params->threadpool); @@ -6698,7 +6706,6 @@ static void ggml_compute_forward_conv_2d_f32( for (int64_t oc = 0; oc < c_out; ++oc) { const float value = gemm_output[i * c_out + oc]; - // Write to WHCN layout: dst[w, h, c, n] float * dst_ptr = (float *)((char *)dst_data + dst_x * dst->nb[0] + dst_y * dst->nb[1] + oc * dst->nb[2] + batch_n * dst->nb[3]); *dst_ptr = value; } @@ -6713,11 +6720,7 @@ void ggml_compute_forward_conv_2d( const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - if (src0->type == GGML_TYPE_F16) { - GGML_ASSERT(false && "F16 not supported yet"); - } else { - ggml_compute_forward_conv_2d_f32(params, src0, src1, dst); - } + ggml_compute_forward_conv_2d_impl(params, src0, src1, dst, src0->type); } // ggml_compute_forward_conv_transpose_2d From 29d77dc03cf59b45386783703f3f32e123c72f99 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sun, 29 Jun 2025 11:27:35 +0800 Subject: [PATCH 7/8] add assert about size --- ggml/src/ggml-cpu/ops.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 5839945b405a3..39eb2e3aadfef 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -6645,7 +6645,7 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params const int64_t patch_per_thread = (patch_n + params->nth - 1) / params->nth; const int64_t patch_start = patch_start_batch + params->ith * patch_per_thread; - const int64_t patch_end = std::min(patch_start + patch_per_thread,patch_end_batch); + const int64_t patch_end = std::min(patch_start + patch_per_thread, patch_end_batch); //im2col for a patch for (int64_t p = patch_start; p < patch_end; ++p) { @@ -6687,6 +6687,8 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params float * gemm_output = (float *) ((char *) tmp + patches_per_batch * knl_n * traits->type_size); + GGML_ASSERT(gemm_output + patch_n * c_out <= (float*)tmp + params->wsize); + // GEMM: patches[patch_n, knl_n] × kernel[knl_n, c_out] = output[patch_n, c_out] ggml_call_mul_mat(kernel_type, params, patch_n, c_out, knl_n, tmp, knl_data, gemm_output); From 808f526b93124ed5b5470aa9cc0610a04670dcc2 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Mon, 30 Jun 2025 22:27:01 +0800 Subject: [PATCH 8/8] Review: further formatting fixes, add assert and use CPU version of fp32->fp16 --- ggml/src/ggml-cpu/ops.cpp | 27 ++++++++++++++------------- ggml/src/ggml.c | 4 ++-- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 39eb2e3aadfef..6948c00b44a19 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -6546,11 +6546,11 @@ void ggml_compute_forward_im2col_back_f32( } } -static void ggml_call_mul_mat(ggml_type T, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k, - void * a, void * b, void * c) { - const ggml_type_traits * traits = ggml_get_type_traits(T); +static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k, + void * a, void * b, float * c) { + const ggml_type_traits * traits = ggml_get_type_traits(type); struct ggml_tensor src1 = {}; - src1.type = T; + src1.type = type; src1.ne[0] = k; src1.ne[1] = m; src1.ne[2] = 1; @@ -6562,7 +6562,7 @@ static void ggml_call_mul_mat(ggml_type T, const ggml_compute_params * params, i src1.data = a; struct ggml_tensor src0 = {}; - src0.type = T; + src0.type = type; src0.ne[0] = k; src0.ne[1] = n; src0.ne[2] = 1; @@ -6598,6 +6598,7 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params ggml_type kernel_type) { GGML_ASSERT(ggml_is_contiguous(kernel)); + GGML_ASSERT(kernel_type == GGML_TYPE_F16 || kernel_type == GGML_TYPE_F32); GGML_ASSERT(kernel->type == kernel_type); const ggml_type_traits * traits = ggml_get_type_traits(kernel_type); @@ -6620,9 +6621,9 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params const int64_t dst_w = dst->ne[0]; const int64_t dst_h = dst->ne[1]; - float * src_data = (float*) src->data; - void * knl_data = kernel->data; - float * dst_data = (float*) dst->data; + const float * src_data = (float *) src->data; + void * knl_data = kernel->data; + float * dst_data = (float *) dst->data; const int64_t knl_n = knl_w * knl_h * c_in; const int64_t patch_total = dst->ne[3] * dst_w * dst_h; @@ -6653,8 +6654,8 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params const int64_t src_x = (p / dst_w) % dst_h; const int64_t src_y = p % dst_w; - float * src_base = (float *)((char *)src_data + batch_n * src->nb[3]); - char * dst_row = (char *) tmp + (p % patches_per_batch) * knl_n * traits->type_size; + const float * src_base = (const float *)((const char *)src_data + batch_n * src->nb[3]); + char * dst_row = (char *) tmp + (p % patches_per_batch) * knl_n * traits->type_size; for (int64_t ic = 0; ic < c_in; ++ic) { for (int64_t ky = 0; ky < knl_h; ++ky) { @@ -6668,15 +6669,15 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) { src_val = 0.0f; } else { - float * src_ptr = (float *)((char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]); - src_val = *src_ptr; + const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]); + src_val = *src_ptr; } char * element_ptr = dst_row + dst_idx * traits->type_size; if (kernel_type == GGML_TYPE_F32) { *(float *) element_ptr = src_val; } else if (kernel_type == GGML_TYPE_F16) { - *(ggml_fp16_t *) element_ptr = GGML_FP32_TO_FP16(src_val); + *(ggml_fp16_t *) element_ptr = GGML_CPU_FP32_TO_FP16(src_val); } } } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 7d407315fea78..47fe37b13e07d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -987,7 +987,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GLU", }; -static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85"); +static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1087,7 +1087,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "glu(x)", }; -static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85"); +static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");