From 4e3f47c73ddc728f795973783d81b9d3c93e4e73 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Sun, 22 Jun 2025 02:57:20 +0800
Subject: [PATCH 1/4] Conv2D: Add CPU version

---
 ggml/include/ggml.h          |  12 +++
 ggml/src/ggml-cpu/ggml-cpu.c |   5 ++
 ggml/src/ggml-cpu/ops.cpp    | 157 +++++++++++++++++++++++++++++++++++
 ggml/src/ggml-cpu/ops.h      |   1 +
 ggml/src/ggml.c              |  43 +++++++++-
 5 files changed, 216 insertions(+), 2 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 9c4e24023b5ad..cfd88caf34367 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -481,6 +481,7 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_CONV_2D,
         GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
@@ -1723,6 +1724,17 @@ extern "C" {
             struct ggml_tensor  * b,
             int                   stride);
 
+    GGML_API struct ggml_tensor * ggml_conv_2d_direct(
+            struct ggml_context * ctx, // context the result tensor is created in
+            struct ggml_tensor  * a,   // convolution kernel [KW, KH, IC, OC]
+            struct ggml_tensor  * b,   // input data [W, H, C, N]; C must equal IC (asserted)
+            int                   s0,  // stride along dimension 0 (W)
+            int                   s1,  // stride along dimension 1 (H)
+            int                   p0,  // padding along dimension 0 (W)
+            int                   p1,  // padding along dimension 1 (H)
+            int                   d0,  // dilation along dimension 0 (W)
+            int                   d1); // dilation along dimension 1 (H); result is [OW, OH, OC, N] with b's type
+
     enum ggml_op_pool {
         GGML_OP_POOL_MAX,
         GGML_OP_POOL_AVG,
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 1d3cd009affc6..815dfadb5658f 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1858,6 +1858,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case GGML_OP_CONV_2D:
+            {
+                ggml_compute_forward_conv_2d(params, tensor);
+            } break;
         case GGML_OP_CONV_2D_DW:
             {
                 ggml_compute_forward_conv_2d_dw(params, tensor);
@@ -2203,6 +2207,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_BACK:
+        case GGML_OP_CONV_2D:
         case GGML_OP_CONV_2D_DW:
         case GGML_OP_CONV_TRANSPOSE_1D:
         case GGML_OP_CONV_TRANSPOSE_2D:
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index eff4a53e3442b..f20a328dc63e4 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6058,6 +6058,163 @@ void ggml_compute_forward_im2col_back_f32(
     }
 }
 
+// ggml_compute_forward_conv_2d
+
+static void ggml_compute_forward_conv_2d_f32(
+        const ggml_compute_params * params,
+        const ggml_tensor * kernel,  // [KW, KH, IC, OC]
+        const ggml_tensor * src,     // [W, H, C, N]
+        ggml_tensor * dst) {         // [OW, OH, OC, N]
+
+    const int32_t s0 = ggml_get_op_params_i32(dst, 0);
+    const int32_t s1 = ggml_get_op_params_i32(dst, 1);
+    const int32_t p0 = ggml_get_op_params_i32(dst, 2);
+    const int32_t p1 = ggml_get_op_params_i32(dst, 3);
+    const int32_t d0 = ggml_get_op_params_i32(dst, 4);
+    const int32_t d1 = ggml_get_op_params_i32(dst, 5);
+
+    const int64_t OW = dst->ne[0];
+    const int64_t OH = dst->ne[1];
+    const int64_t OC = dst->ne[2];
+    const int64_t N  = dst->ne[3];
+
+    const int64_t IW = src->ne[0];
+    const int64_t IH = src->ne[1];
+    const int64_t IC = src->ne[2];
+
+    const int64_t KW = kernel->ne[0];
+    const int64_t KH = kernel->ne[1];
+
+    const float * kernel_data = (const float *)kernel->data;
+    const float * src_data    = (const float *)src->data;
+    float       * dst_data    = (float       *)dst->data;
+
+    const int64_t rows_total = OH * N;
+    const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
+    const int64_t row_start = params->ith * rows_per_thread;
+    const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
+
+    for (int64_t row = row_start; row < row_end; ++row) {
+        const int64_t oh = row % OH;
+        const int64_t n  = row / OH;
+        const float * src_batch = src_data + n * IW * IH * IC;
+
+        for (int64_t ow = 0; ow < OW; ++ow) {
+            for (int64_t oc = 0; oc < OC; ++oc) {
+                float sum = 0.0f;
+                const float * kernel_channel = kernel_data + oc * KW * KH * IC;
+
+                for (int64_t kh = 0; kh < KH; ++kh) {
+                    const int64_t ih = oh * s1 - p1 + kh * d1;
+                    if (ih < 0 || ih >= IH) continue;
+
+                    for (int64_t kw = 0; kw < KW; ++kw) {
+                        const int64_t iw = ow * s0 - p0 + kw * d0;
+                        if (iw < 0 || iw >= IW) continue;
+
+                        #pragma omp simd
+                        for (int64_t ic = 0; ic < IC; ++ic) {
+                            const float * kernel_ptr = kernel_channel + (kh * KW + kw) + ic * KW * KH;
+                            const float * src_ptr = src_batch + (ih * IW + iw) + ic * IW * IH;
+                            sum += (*kernel_ptr) * (*src_ptr);
+                        }
+                    }
+                }
+
+                dst_data[((n * OC + oc) * OH + oh) * OW + ow] = sum;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_conv_2d_f16(
+        const ggml_compute_params * params,
+        const ggml_tensor * kernel,  // [KW, KH, IC, OC]
+        const ggml_tensor * src,     // [W, H, C, N]
+        ggml_tensor * dst) {         // [OW, OH, OC, N]
+
+    const int32_t s0 = ggml_get_op_params_i32(dst, 0);
+    const int32_t s1 = ggml_get_op_params_i32(dst, 1);
+    const int32_t p0 = ggml_get_op_params_i32(dst, 2);
+    const int32_t p1 = ggml_get_op_params_i32(dst, 3);
+    const int32_t d0 = ggml_get_op_params_i32(dst, 4);
+    const int32_t d1 = ggml_get_op_params_i32(dst, 5);
+
+    const int64_t OW = dst->ne[0];
+    const int64_t OH = dst->ne[1];
+    const int64_t OC = dst->ne[2];
+    const int64_t N  = dst->ne[3];
+
+    const int64_t IW = src->ne[0];
+    const int64_t IH = src->ne[1];
+    const int64_t IC = src->ne[2];
+
+    const int64_t KW = kernel->ne[0];
+    const int64_t KH = kernel->ne[1];
+
+    const ggml_fp16_t * kernel_data = (const ggml_fp16_t *)kernel->data;
+    const ggml_fp16_t * src_data    = (const ggml_fp16_t *)src->data;
+    ggml_fp16_t       * dst_data    = (ggml_fp16_t       *)dst->data;
+
+    const int64_t rows_total = OH * N;
+    const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
+    const int64_t row_start = params->ith * rows_per_thread;
+    const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
+
+    for (int64_t row = row_start; row < row_end; ++row) {
+        const int64_t oh = row % OH;
+        const int64_t n  = row / OH;
+        const ggml_fp16_t * src_batch = src_data + n * IW * IH * IC;
+
+        for (int64_t ow = 0; ow < OW; ++ow) {
+            for (int64_t oc = 0; oc < OC; ++oc) {
+                float sum = 0.0f;
+                const ggml_fp16_t * kernel_channel = kernel_data + oc * KW * KH * IC;
+                for (int64_t kh = 0; kh < KH; ++kh) {
+                    const int64_t ih = oh * s1 - p1 + kh * d1;
+                    if (ih < 0 || ih >= IH) continue;
+
+                    for (int64_t kw = 0; kw < KW; ++kw) {
+                        const int64_t iw = ow * s0 - p0 + kw * d0;
+                        if (iw < 0 || iw >= IW) continue;
+
+                        for (int64_t ic = 0; ic < IC; ++ic) {
+                            const ggml_fp16_t * kernel_ptr = kernel_channel + (kh * KW + kw) + ic * KW * KH;
+                            const ggml_fp16_t * src_ptr = src_batch + (ih * IW + iw) + ic * IW * IH;
+                            sum += GGML_FP16_TO_FP32(*kernel_ptr) * GGML_FP16_TO_FP32(*src_ptr);
+                        }
+                    }
+                }
+
+                dst_data[((n * OC + oc) * OH + oh) * OW + ow] = GGML_FP32_TO_FP16(sum);
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_conv_2d(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_conv_2d_f16(params, src0, src1, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 // ggml_compute_forward_conv_transpose_2d
 
 void ggml_compute_forward_conv_transpose_2d(
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index 2d8544d7d3d43..588e081e94068 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -64,6 +64,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc
 void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index f8e7c595bce15..1283aa41bd41a 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -986,7 +986,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
 
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1043,6 +1043,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "conv_transpose_1d(x)",
     "im2col(x)",
     "im2col_back(x)",
+    "conv_2d(x)",
     "conv_2d_dw(x)",
     "conv_transpose_2d(x)",
     "pool_1d(x)",
@@ -1082,7 +1083,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };
 
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -4131,6 +4132,44 @@ struct ggml_tensor * ggml_conv_2d_dw_direct(
     return result;
 }
 
+// ggml_conv_2d_direct
+
+struct ggml_tensor * ggml_conv_2d_direct(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,   // convolution kernel [KW, KH, IC, OC]
+        struct ggml_tensor  * b,   // input data [W, H, C, N]
+        int                   s0,  // stride dimension 0
+        int                   s1,  // stride dimension 1
+        int                   p0,  // padding dimension 0
+        int                   p1,  // padding dimension 1
+        int                   d0,  // dilation dimension 0
+        int                   d1) {// dilation dimension 1
+
+    GGML_ASSERT(a->ne[2] == b->ne[2]);
+    GGML_ASSERT(a->type == b->type);
+
+    int64_t ne[4];
+    ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
+    ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
+    ne[2] = a->ne[3];
+    ne[3] = b->ne[3];
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
+
+    ggml_set_op_params_i32(result, 0, s0);
+    ggml_set_op_params_i32(result, 1, s1);
+    ggml_set_op_params_i32(result, 2, p0);
+    ggml_set_op_params_i32(result, 3, p1);
+    ggml_set_op_params_i32(result, 4, d0);
+    ggml_set_op_params_i32(result, 5, d1);
+
+    result->op = GGML_OP_CONV_2D;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
 // ggml_conv_transpose_2d_p0
 
 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {

From 870b6509dd2a9c6f691d8996776273724059d90a Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Thu, 26 Jun 2025 14:53:10 +0800
Subject: [PATCH 2/4] Conv2D: F32 path via batched im2col + mul_mat (half decent)

---
 ggml/src/ggml-cpu/ggml-cpu.c |  12 ++-
 ggml/src/ggml-cpu/ops.cpp    | 200 ++++++++++++++++++++++++++---------
 ggml/src/ggml-cpu/ops.h      |   4 +
 ggml/src/ggml.c              |   2 +-
 4 files changed, 168 insertions(+), 50 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 815dfadb5658f..14a1934e06d84 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -683,6 +683,10 @@ static void ggml_init_arm_arch_features(void) {
 
 #endif // __ARM_ARCH
 
+void ggml_compute_forward_mul_mat(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst);
+
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
     GGML_ASSERT(!ggml_get_no_alloc(ctx));
 
@@ -1189,7 +1193,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
     }
 }
 
-static void ggml_compute_forward_mul_mat(
+void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
               struct ggml_tensor * dst) {
 
@@ -2726,6 +2730,12 @@ struct ggml_cplan ggml_graph_plan(
                             GGML_ABORT("fatal error");
                         }
                     } break;
+                case GGML_OP_CONV_2D:
+                    {
+                        cur = GGML_IM2COL_WORK_SIZE;
+                        // Extra headroom: one fp16 slot per element of src[1]. NOTE(review): was labelled "kernel transpose", but src[0] is the kernel and src[1] the input - confirm which tensor is intended.
+                        cur += sizeof(ggml_fp16_t)*node->src[1]->ne[0]*node->src[1]->ne[1]*node->src[1]->ne[2]*node->src[1]->ne[3];
+                    } break;
                 case GGML_OP_CONV_TRANSPOSE_2D:
                     {
                         const int64_t ne00 = node->src[0]->ne[0]; // W
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index f20a328dc63e4..0c2ce34ded0e4 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -3,6 +3,7 @@
 #include "ggml-cpu.h"
 #include "ggml-impl.h"
 #include "binary-ops.h"
+#include "ggml.h"
 #include "unary-ops.h"
 #include "vec.h"
 
@@ -6058,70 +6059,173 @@ void ggml_compute_forward_im2col_back_f32(
     }
 }
 
+static void ggml_call_mul_mat(
+    const ggml_compute_params * params,  // forwarded unchanged to ggml_compute_forward_mul_mat
+    int64_t m, int64_t n, int64_t k,     // a holds m rows of k values, b holds n rows of k values, c receives m rows of n values
+    void * a, void * b, void * c) {      // raw buffers; the strides set below assume packed float elements
+
+    struct ggml_tensor src1 = {};        // stack-only tensor header around `a`; every field not assigned below stays zero
+    src1.ne[0] = k;
+    src1.ne[1] = m;
+    src1.ne[2] = 1;
+    src1.ne[3] = 1;
+    src1.nb[0] = sizeof(float);          // NOTE(review): .type is never set - presumably relies on type 0 being F32; confirm
+    src1.nb[1] = k * sizeof(float);
+    src1.nb[2] = src1.nb[1];             // dims 2 and 3 have extent 1, so these strides just repeat nb[1]
+    src1.nb[3] = src1.nb[2];
+    src1.data  = a;
+
+    struct ggml_tensor src0 = {};        // stack-only tensor header around `b`
+    src0.ne[0] = k;
+    src0.ne[1] = n;
+    src0.ne[2] = 1;
+    src0.ne[3] = 1;
+    src0.nb[0] = sizeof(float);
+    src0.nb[1] = k * sizeof(float);
+    src0.nb[2] = src0.nb[1];
+    src0.nb[3] = src0.nb[2];
+    src0.data  = b;
+
+    struct ggml_tensor dst = {};         // stack-only tensor header around `c`
+    dst.ne[0] = n;
+    dst.ne[1] = m;
+    dst.ne[2] = 1;
+    dst.ne[3] = 1;
+    dst.nb[0] = sizeof(float);
+    dst.nb[1] = n * sizeof(float);
+    dst.nb[2] = dst.nb[1];
+    dst.nb[3] = dst.nb[2];
+    dst.data  = c;
+    dst.src[0] = &src0;                  // note the swap: `b` is passed as src[0] and `a` as src[1]
+    dst.src[1] = &src1;
+
+    ggml_compute_forward_mul_mat(params, &dst);  // the headers above are locals, so they are only valid during this call
+}
+
+
 // ggml_compute_forward_conv_2d
 
-static void ggml_compute_forward_conv_2d_f32(
-        const ggml_compute_params * params,
-        const ggml_tensor * kernel,  // [KW, KH, IC, OC]
-        const ggml_tensor * src,     // [W, H, C, N]
-        ggml_tensor * dst) {         // [OW, OH, OC, N]
+static void ggml_compute_forward_conv_2d_f32(const ggml_compute_params * params,
+                                       ggml_tensor        * dst) {
 
-    const int32_t s0 = ggml_get_op_params_i32(dst, 0);
-    const int32_t s1 = ggml_get_op_params_i32(dst, 1);
-    const int32_t p0 = ggml_get_op_params_i32(dst, 2);
-    const int32_t p1 = ggml_get_op_params_i32(dst, 3);
-    const int32_t d0 = ggml_get_op_params_i32(dst, 4);
-    const int32_t d1 = ggml_get_op_params_i32(dst, 5);
+    const ggml_tensor * src    = dst->src[1];      // [W H C_in N]
+    const ggml_tensor * kernel = dst->src[0];      // [W H C_in C_out]
 
-    const int64_t OW = dst->ne[0];
-    const int64_t OH = dst->ne[1];
-    const int64_t OC = dst->ne[2];
-    const int64_t N  = dst->ne[3];
+    GGML_ASSERT(ggml_is_contiguous(kernel));
 
-    const int64_t IW = src->ne[0];
-    const int64_t IH = src->ne[1];
-    const int64_t IC = src->ne[2];
+    const int32_t stride_x = dst->op_params[0];
+    const int32_t stride_y = dst->op_params[1];
+    const int32_t pad_x    = dst->op_params[2];
+    const int32_t pad_y    = dst->op_params[3];
 
-    const int64_t KW = kernel->ne[0];
-    const int64_t KH = kernel->ne[1];
+    const int64_t c_in  = src->ne[2];
+    const int64_t c_out = kernel->ne[3];
+    GGML_ASSERT(c_in == kernel->ne[2]);
 
-    const float * kernel_data = (const float *)kernel->data;
-    const float * src_data    = (const float *)src->data;
-    float       * dst_data    = (float       *)dst->data;
+    const int64_t src_w = src->ne[0];
+    const int64_t src_h = src->ne[1];
+    const int64_t knl_w = kernel->ne[0];
+    const int64_t knl_h = kernel->ne[1];
+    const int64_t dst_w = dst->ne[0];
+    const int64_t dst_h = dst->ne[1];
 
-    const int64_t rows_total = OH * N;
-    const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
-    const int64_t row_start = params->ith * rows_per_thread;
-    const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
 
-    for (int64_t row = row_start; row < row_end; ++row) {
-        const int64_t oh = row % OH;
-        const int64_t n  = row / OH;
-        const float * src_batch = src_data + n * IW * IH * IC;
+    float * src_data = (float *) src->data;
+    float * knl_data = (float *) kernel->data;
+    float * dst_data = (      float *) dst->data;
 
-        for (int64_t ow = 0; ow < OW; ++ow) {
-            for (int64_t oc = 0; oc < OC; ++oc) {
-                float sum = 0.0f;
-                const float * kernel_channel = kernel_data + oc * KW * KH * IC;
 
-                for (int64_t kh = 0; kh < KH; ++kh) {
-                    const int64_t ih = oh * s1 - p1 + kh * d1;
-                    if (ih < 0 || ih >= IH) continue;
+    const int64_t knl_n           = knl_w * knl_h * c_in;
+    const int64_t patch_total     = dst->ne[3] * dst_w * dst_h;
+    
 
-                    for (int64_t kw = 0; kw < KW; ++kw) {
-                        const int64_t iw = ow * s0 - p0 + kw * d0;
-                        if (iw < 0 || iw >= IW) continue;
+    
+    const int64_t space_per_patch = knl_n * sizeof(float) + patch_total * c_out * sizeof(float);
 
-                        #pragma omp simd
-                        for (int64_t ic = 0; ic < IC; ++ic) {
-                            const float * kernel_ptr = kernel_channel + (kh * KW + kw) + ic * KW * KH;
-                            const float * src_ptr = src_batch + (ih * IW + iw) + ic * IW * IH;
-                            sum += (*kernel_ptr) * (*src_ptr);
+    const int64_t batch_size = params->wsize / space_per_patch;
+    const int64_t patches_per_batch = batch_size > 8 ? (batch_size / 8) * 8 : batch_size;
+    const int64_t batch_n = (patch_total + patches_per_batch - 1) / patches_per_batch;
+
+
+    GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1);
+
+    float * tmp = (float *) params->wdata;          // per-thread scratch
+
+    for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) {
+
+        const int64_t patch_start_batch = batch_i * patches_per_batch;
+        const int64_t patch_end_batch   = std::min(patch_start_batch + patches_per_batch,
+                                              patch_total);
+        const int64_t patch_n = patch_end_batch - patch_start_batch;
+
+        const int64_t patch_per_thread =
+            (patch_n + params->nth - 1) / params->nth;
+        const int64_t patch_start = patch_start_batch +
+                                    params->ith * patch_per_thread;
+        const int64_t patch_end   = std::min(patch_start + patch_per_thread,
+                                        patch_end_batch);
+
+        //im2col for a patch
+        for (int64_t p = patch_start; p < patch_end; ++p) {
+            const int64_t  b     =  p / (dst_w * dst_h);
+            const int64_t  dy    = (p / dst_w) % dst_h;
+            const int64_t  dx    =  p % dst_w;
+
+            const float  * src_base = (const float *)((char *)src_data + b * src->nb[3]);
+            float        * out_row = tmp + (p % patches_per_batch) * knl_n;
+
+            // Extract patch in IC,KH,KW order (same as im2col)
+            for (int64_t ic = 0; ic < c_in; ++ic) {
+                for (int64_t ky = 0; ky < knl_h; ++ky) {
+                    for (int64_t kx = 0; kx < knl_w; ++kx) {
+                        const int64_t sy = dy * stride_y + ky - pad_y;
+                        const int64_t sx = dx * stride_x + kx - pad_x;
+                        
+                        int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx;
+                        
+                        if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) {
+                            out_row[dst_idx] = 0.0f;
+                        } else {
+                            float * src_ptr = (float *)((char *)src_base + 
+                                sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]);
+                            out_row[dst_idx] = *src_ptr;
                         }
                     }
                 }
+            }
+        }   // patches handled by this thread
+
+        ggml_barrier(params->threadpool);   // wait for all threads
 
-                dst_data[((n * OC + oc) * OH + oh) * OW + ow] = sum;
+        //GEMM output is patch_n * cout 
+        float * gemm_output = tmp + patches_per_batch * knl_n;
+        
+        // GEMM: patches[patch_n, knl_n] × kernel[knl_n, c_out] = output[patch_n, c_out]
+        ggml_call_mul_mat(params, patch_n, c_out, knl_n,
+                            tmp, knl_data, gemm_output);
+        
+        // Barrier to ensure GEMM completes before permutation
+        ggml_barrier(params->threadpool);
+        
+        // Distribute permutation work across threads
+        const int64_t permute_per_thread = (patch_n + params->nth - 1) / params->nth;
+        const int64_t permute_start = params->ith * permute_per_thread;
+        const int64_t permute_end = std::min(permute_start + permute_per_thread, patch_n);
+        
+        // Each thread handles part of the permutation from [patch_n, c_out] to WHCN layout
+        for (int64_t i = permute_start; i < permute_end; ++i) {
+            const int64_t p = patch_start_batch + i;
+            const int64_t b  = p / (dst_w * dst_h);         // batch index
+            const int64_t dy = (p / dst_w) % dst_h;         // height index  
+            const int64_t dx = p % dst_w;                   // width index
+            
+            // Copy all channels for this spatial position
+            for (int64_t oc = 0; oc < c_out; ++oc) {
+                const float value = gemm_output[i * c_out + oc];
+                // Write to WHCN layout: dst[w, h, c, n]
+                float * dst_ptr = (float *)((char *)dst_data + 
+                    dx * dst->nb[0] + dy * dst->nb[1] + oc * dst->nb[2] + b * dst->nb[3]);
+                *dst_ptr = value;
             }
         }
     }
@@ -6206,7 +6310,7 @@ void ggml_compute_forward_conv_2d(
             } break;
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
+                ggml_compute_forward_conv_2d_f32(params, dst);
             } break;
         default:
             {
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index 588e081e94068..6bcbb0a2fab88 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -20,6 +20,9 @@
 
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
+// Base work-buffer size in bytes requested for GGML_OP_CONV_2D; it holds the im2col patches and the GEMM output
+#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)  // 16 MiB
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -106,6 +109,7 @@ void ggml_compute_forward_custom(const struct ggml_compute_params * params, stru
 void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 
 #ifdef __cplusplus
 }
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1283aa41bd41a..db562bdf53f96 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4146,7 +4146,7 @@ struct ggml_tensor * ggml_conv_2d_direct(
         int                   d1) {// dilation dimension 1
 
     GGML_ASSERT(a->ne[2] == b->ne[2]);
-    GGML_ASSERT(a->type == b->type);
+    //GGML_ASSERT(a->type == b->type);
 
     int64_t ne[4];
     ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);

From bde9061c44d7dc069e63649e6cce5b2700a39c2a Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Thu, 26 Jun 2025 17:50:25 +0800
Subject: [PATCH 3/4] Tiled approach for F32

---
 ggml/src/ggml-cpu/ops.cpp | 190 +++++++++++---------------------------
 tests/CMakeLists.txt      |   1 +
 2 files changed, 53 insertions(+), 138 deletions(-)

diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 0c2ce34ded0e4..5f55f5f8f8788 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6105,18 +6105,21 @@ static void ggml_call_mul_mat(
 
 // ggml_compute_forward_conv_2d
 
-static void ggml_compute_forward_conv_2d_f32(const ggml_compute_params * params,
-                                       ggml_tensor        * dst) {
-
-    const ggml_tensor * src    = dst->src[1];      // [W H C_in N]
-    const ggml_tensor * kernel = dst->src[0];      // [W H C_in C_out]
+static void ggml_compute_forward_conv_2d_f32(
+        const ggml_compute_params * params,
+        const ggml_tensor * kernel,  // [KW, KH, IC, OC] - fp32
+        const ggml_tensor * src,     // [W, H, C, N]
+        ggml_tensor * dst) {         // [OW, OH, OC, N]
 
     GGML_ASSERT(ggml_is_contiguous(kernel));
+    GGML_ASSERT(kernel->type == GGML_TYPE_F32);
 
-    const int32_t stride_x = dst->op_params[0];
-    const int32_t stride_y = dst->op_params[1];
-    const int32_t pad_x    = dst->op_params[2];
-    const int32_t pad_y    = dst->op_params[3];
+    const int32_t stride_x   = dst->op_params[0];
+    const int32_t stride_y   = dst->op_params[1];
+    const int32_t pad_x      = dst->op_params[2];
+    const int32_t pad_y      = dst->op_params[3];
+    const int32_t dilation_x = dst->op_params[4];
+    const int32_t dilation_y = dst->op_params[5];
 
     const int64_t c_in  = src->ne[2];
     const int64_t c_out = kernel->ne[3];
@@ -6129,173 +6132,93 @@ static void ggml_compute_forward_conv_2d_f32(const ggml_compute_params * params,
     const int64_t dst_w = dst->ne[0];
     const int64_t dst_h = dst->ne[1];
 
-
-    float * src_data = (float *) src->data;
-    float * knl_data = (float *) kernel->data;
-    float * dst_data = (      float *) dst->data;
-
+    float * src_data = (float*) src->data;
+    float * knl_data = (float*) kernel->data;
+    float * dst_data = (float*) dst->data;
 
     const int64_t knl_n           = knl_w * knl_h * c_in;
     const int64_t patch_total     = dst->ne[3] * dst_w * dst_h;
-    
-
-    
-    const int64_t space_per_patch = knl_n * sizeof(float) + patch_total * c_out * sizeof(float);
 
-    const int64_t batch_size = params->wsize / space_per_patch;
+    const int64_t space_per_patch   = knl_n * sizeof(float) + c_out * sizeof(float);
+    const int64_t batch_size        = params->wsize / space_per_patch;
     const int64_t patches_per_batch = batch_size > 8 ? (batch_size / 8) * 8 : batch_size;
-    const int64_t batch_n = (patch_total + patches_per_batch - 1) / patches_per_batch;
-
+    const int64_t batch_n           = (patch_total + patches_per_batch - 1) / patches_per_batch;
 
     GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1);
 
-    float * tmp = (float *) params->wdata;          // per-thread scratch
+    float * tmp = (float *) params->wdata;
 
     for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) {
 
         const int64_t patch_start_batch = batch_i * patches_per_batch;
         const int64_t patch_end_batch   = std::min(patch_start_batch + patches_per_batch,
                                               patch_total);
-        const int64_t patch_n = patch_end_batch - patch_start_batch;
+        const int64_t patch_n           = patch_end_batch - patch_start_batch;
 
-        const int64_t patch_per_thread =
-            (patch_n + params->nth - 1) / params->nth;
-        const int64_t patch_start = patch_start_batch +
-                                    params->ith * patch_per_thread;
-        const int64_t patch_end   = std::min(patch_start + patch_per_thread,
-                                        patch_end_batch);
+        const int64_t patch_per_thread  = (patch_n + params->nth - 1) / params->nth;
+        const int64_t patch_start       = patch_start_batch + params->ith * patch_per_thread;
+        const int64_t patch_end         = std::min(patch_start + patch_per_thread,patch_end_batch);
 
         //im2col for a patch
         for (int64_t p = patch_start; p < patch_end; ++p) {
-            const int64_t  b     =  p / (dst_w * dst_h);
-            const int64_t  dy    = (p / dst_w) % dst_h;
-            const int64_t  dx    =  p % dst_w;
+            const int64_t  batch_n     =  p / (dst_w * dst_h);  // index along dim 3 (N); shadows the outer batch_n (number of batches)
+            const int64_t  src_x       = (p / dst_w) % dst_h;   // NOTE: despite the name, this is the output row (y); it is scaled by stride_y below
+            const int64_t  src_y       =  p % dst_w;            // NOTE: despite the name, this is the output column (x); it is scaled by stride_x below
 
-            const float  * src_base = (const float *)((char *)src_data + b * src->nb[3]);
-            float        * out_row = tmp + (p % patches_per_batch) * knl_n;
+            float * src_base = (float *)((char *)src_data + batch_n * src->nb[3]);
+            float * dst_row  = tmp + (p % patches_per_batch) * knl_n;
 
-            // Extract patch in IC,KH,KW order (same as im2col)
             for (int64_t ic = 0; ic < c_in; ++ic) {
                 for (int64_t ky = 0; ky < knl_h; ++ky) {
                     for (int64_t kx = 0; kx < knl_w; ++kx) {
-                        const int64_t sy = dy * stride_y + ky - pad_y;
-                        const int64_t sx = dx * stride_x + kx - pad_x;
-                        
+                        const int64_t sy = src_x * stride_y + ky * dilation_y - pad_y;
+                        const int64_t sx = src_y * stride_x + kx * dilation_x - pad_x;
+
                         int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx;
-                        
+
                         if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) {
-                            out_row[dst_idx] = 0.0f;
+                            dst_row[dst_idx] = 0.0f;
                         } else {
-                            float * src_ptr = (float *)((char *)src_base + 
-                                sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]);
-                            out_row[dst_idx] = *src_ptr;
+                            float * src_ptr = (float *)((char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]);
+                            dst_row[dst_idx] = *src_ptr;
                         }
                     }
                 }
             }
         }   // patches handled by this thread
 
-        ggml_barrier(params->threadpool);   // wait for all threads
+        ggml_barrier(params->threadpool);
 
-        //GEMM output is patch_n * cout 
         float * gemm_output = tmp + patches_per_batch * knl_n;
-        
+
         // GEMM: patches[patch_n, knl_n] × kernel[knl_n, c_out] = output[patch_n, c_out]
         ggml_call_mul_mat(params, patch_n, c_out, knl_n,
                             tmp, knl_data, gemm_output);
-        
-        // Barrier to ensure GEMM completes before permutation
+
         ggml_barrier(params->threadpool);
-        
-        // Distribute permutation work across threads
+
+
+        // scatter GEMM output [patch_n, c_out] into dst, which is laid out as [OW, OH, OC, N]
         const int64_t permute_per_thread = (patch_n + params->nth - 1) / params->nth;
         const int64_t permute_start = params->ith * permute_per_thread;
         const int64_t permute_end = std::min(permute_start + permute_per_thread, patch_n);
-        
-        // Each thread handles part of the permutation from [patch_n, c_out] to WHCN layout
+
         for (int64_t i = permute_start; i < permute_end; ++i) {
-            const int64_t p = patch_start_batch + i;
-            const int64_t b  = p / (dst_w * dst_h);         // batch index
-            const int64_t dy = (p / dst_w) % dst_h;         // height index  
-            const int64_t dx = p % dst_w;                   // width index
-            
-            // Copy all channels for this spatial position
+            const int64_t p       = patch_start_batch + i;
+            const int64_t batch_n = p / (dst_w * dst_h);
+            const int64_t dst_y   = (p / dst_w) % dst_h;
+            const int64_t dst_x   = p % dst_w;
+
             for (int64_t oc = 0; oc < c_out; ++oc) {
                 const float value = gemm_output[i * c_out + oc];
                 // Write to WHCN layout: dst[w, h, c, n]
-                float * dst_ptr = (float *)((char *)dst_data + 
-                    dx * dst->nb[0] + dy * dst->nb[1] + oc * dst->nb[2] + b * dst->nb[3]);
+                float * dst_ptr = (float *)((char *)dst_data + dst_x * dst->nb[0] + dst_y * dst->nb[1] + oc * dst->nb[2] + batch_n * dst->nb[3]);
                 *dst_ptr = value;
             }
         }
     }
 }
 
-static void ggml_compute_forward_conv_2d_f16(
-        const ggml_compute_params * params,
-        const ggml_tensor * kernel,  // [KW, KH, IC, OC]
-        const ggml_tensor * src,     // [W, H, C, N]
-        ggml_tensor * dst) {         // [OW, OH, OC, N]
-
-    const int32_t s0 = ggml_get_op_params_i32(dst, 0);
-    const int32_t s1 = ggml_get_op_params_i32(dst, 1);
-    const int32_t p0 = ggml_get_op_params_i32(dst, 2);
-    const int32_t p1 = ggml_get_op_params_i32(dst, 3);
-    const int32_t d0 = ggml_get_op_params_i32(dst, 4);
-    const int32_t d1 = ggml_get_op_params_i32(dst, 5);
-
-    const int64_t OW = dst->ne[0];
-    const int64_t OH = dst->ne[1];
-    const int64_t OC = dst->ne[2];
-    const int64_t N  = dst->ne[3];
-
-    const int64_t IW = src->ne[0];
-    const int64_t IH = src->ne[1];
-    const int64_t IC = src->ne[2];
-
-    const int64_t KW = kernel->ne[0];
-    const int64_t KH = kernel->ne[1];
-
-    const ggml_fp16_t * kernel_data = (const ggml_fp16_t *)kernel->data;
-    const ggml_fp16_t * src_data    = (const ggml_fp16_t *)src->data;
-    ggml_fp16_t       * dst_data    = (ggml_fp16_t       *)dst->data;
-
-    const int64_t rows_total = OH * N;
-    const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
-    const int64_t row_start = params->ith * rows_per_thread;
-    const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
-
-    for (int64_t row = row_start; row < row_end; ++row) {
-        const int64_t oh = row % OH;
-        const int64_t n  = row / OH;
-        const ggml_fp16_t * src_batch = src_data + n * IW * IH * IC;
-
-        for (int64_t ow = 0; ow < OW; ++ow) {
-            for (int64_t oc = 0; oc < OC; ++oc) {
-                float sum = 0.0f;
-                const ggml_fp16_t * kernel_channel = kernel_data + oc * KW * KH * IC;
-                for (int64_t kh = 0; kh < KH; ++kh) {
-                    const int64_t ih = oh * s1 - p1 + kh * d1;
-                    if (ih < 0 || ih >= IH) continue;
-
-                    for (int64_t kw = 0; kw < KW; ++kw) {
-                        const int64_t iw = ow * s0 - p0 + kw * d0;
-                        if (iw < 0 || iw >= IW) continue;
-
-                        for (int64_t ic = 0; ic < IC; ++ic) {
-                            const ggml_fp16_t * kernel_ptr = kernel_channel + (kh * KW + kw) + ic * KW * KH;
-                            const ggml_fp16_t * src_ptr = src_batch + (ih * IW + iw) + ic * IW * IH;
-                            sum += GGML_FP16_TO_FP32(*kernel_ptr) * GGML_FP16_TO_FP32(*src_ptr);
-                        }
-                    }
-                }
-
-                dst_data[((n * OC + oc) * OH + oh) * OW + ow] = GGML_FP32_TO_FP16(sum);
-            }
-        }
-    }
-}
-
 void ggml_compute_forward_conv_2d(
         const ggml_compute_params * params,
         ggml_tensor * dst) {
@@ -6303,19 +6226,10 @@ void ggml_compute_forward_conv_2d(
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
 
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_conv_2d_f16(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_conv_2d_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
+    if (src0->type == GGML_TYPE_F16) {
+        GGML_ASSERT(false && "F16 not supported yet");
+    } else {
+        ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
     }
 }
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index fc1557a2d4065..517cc7a945e25 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -195,6 +195,7 @@ endif()
 # llama_build_and_test(test-opt.cpp) # SLOW
 llama_build_and_test(test-gguf.cpp)
 llama_build_and_test(test-backend-ops.cpp)
+llama_build_and_test(test_conv2d_comparison.cpp)
 
 llama_build_and_test(test-model-load-cancel.cpp  LABEL "model")
 llama_build_and_test(test-autorelease.cpp        LABEL "model")

From b43afa7b180c91fab9b35c772a52119ad480af09 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Thu, 26 Jun 2025 17:54:12 +0800
Subject: [PATCH 4/4] tests: remove test_conv2d_comparison.cpp from CMakeLists.txt

---
 tests/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 517cc7a945e25..fc1557a2d4065 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -195,7 +195,6 @@ endif()
 # llama_build_and_test(test-opt.cpp) # SLOW
 llama_build_and_test(test-gguf.cpp)
 llama_build_and_test(test-backend-ops.cpp)
-llama_build_and_test(test_conv2d_comparison.cpp)
 
 llama_build_and_test(test-model-load-cancel.cpp  LABEL "model")
 llama_build_and_test(test-autorelease.cpp        LABEL "model")