From f5876f2d5c8a5b953d1744fc44af374acf98f436 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Sun, 22 Jun 2025 02:57:20 +0800
Subject: [PATCH 1/8] Conv2D: Add CPU version

---
 ggml/include/ggml.h          |  12 +++
 ggml/src/ggml-cpu/ggml-cpu.c |   5 ++
 ggml/src/ggml-cpu/ops.cpp    | 157 +++++++++++++++++++++++++++++++++++
 ggml/src/ggml-cpu/ops.h      |   1 +
 ggml/src/ggml.c              |  39 +++++++++
 5 files changed, 214 insertions(+)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index e5dda969a38fe..8bd3ca929285c 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -482,6 +482,7 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_CONV_2D,
         GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
@@ -1813,6 +1814,17 @@ extern "C" {
             struct ggml_tensor  * b,
             int                   stride);
 
+    GGML_API struct ggml_tensor * ggml_conv_2d_direct(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,   // convolution kernel [KW, KH, IC, OC]
+            struct ggml_tensor  * b,   // input data [W, H, C, N]
+            int                   s0,  // stride dimension 0
+            int                   s1,  // stride dimension 1
+            int                   p0,  // padding dimension 0
+            int                   p1,  // padding dimension 1
+            int                   d0,  // dilation dimension 0
+            int                   d1); // dilation dimension 1
+
     enum ggml_op_pool {
         GGML_OP_POOL_MAX,
         GGML_OP_POOL_AVG,
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 1d68cde71a65e..75d830f62b7e1 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1866,6 +1866,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case GGML_OP_CONV_2D:
+            {
+                ggml_compute_forward_conv_2d(params, tensor);
+            } break;
         case GGML_OP_CONV_2D_DW:
             {
                 ggml_compute_forward_conv_2d_dw(params, tensor);
@@ -2228,6 +2232,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_BACK:
+        case GGML_OP_CONV_2D:
         case GGML_OP_CONV_2D_DW:
         case GGML_OP_CONV_TRANSPOSE_1D:
         case GGML_OP_CONV_TRANSPOSE_2D:
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 27586ed1fdb2c..fdc7990d9d153 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6545,6 +6545,163 @@ void ggml_compute_forward_im2col_back_f32(
     }
 }
 
+// ggml_compute_forward_conv_2d
+
+static void ggml_compute_forward_conv_2d_f32(
+        const ggml_compute_params * params,
+        const ggml_tensor * kernel,  // [KW, KH, IC, OC]
+        const ggml_tensor * src,     // [W, H, C, N]
+        ggml_tensor * dst) {         // [OW, OH, OC, N]
+
+    const int32_t s0 = ggml_get_op_params_i32(dst, 0);
+    const int32_t s1 = ggml_get_op_params_i32(dst, 1);
+    const int32_t p0 = ggml_get_op_params_i32(dst, 2);
+    const int32_t p1 = ggml_get_op_params_i32(dst, 3);
+    const int32_t d0 = ggml_get_op_params_i32(dst, 4);
+    const int32_t d1 = ggml_get_op_params_i32(dst, 5);
+
+    const int64_t OW = dst->ne[0];
+    const int64_t OH = dst->ne[1];
+    const int64_t OC = dst->ne[2];
+    const int64_t N  = dst->ne[3];
+
+    const int64_t IW = src->ne[0];
+    const int64_t IH = src->ne[1];
+    const int64_t IC = src->ne[2];
+
+    const int64_t KW = kernel->ne[0];
+    const int64_t KH = kernel->ne[1];
+
+    const float * kernel_data = (const float *)kernel->data;
+    const float * src_data    = (const float *)src->data;
+    float       * dst_data    = (float       *)dst->data;
+
+    const int64_t rows_total = OH * N;
+    const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
+    const int64_t row_start = params->ith * rows_per_thread;
+    const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
+
+    for (int64_t row = row_start; row < row_end; ++row) {
+        const int64_t oh = row % OH;
+        const int64_t n  = row / OH;
+        const float * src_batch = src_data + n * IW * IH * IC;
+
+        for (int64_t ow = 0; ow < OW; ++ow) {
+            for (int64_t oc = 0; oc < OC; ++oc) {
+                float sum = 0.0f;
+                const float * kernel_channel = kernel_data + oc * KW * KH * IC;
+
+                for (int64_t kh = 0; kh < KH; ++kh) {
+                    const int64_t ih = oh * s1 - p1 + kh * d1;
+                    if (ih < 0 || ih >= IH) continue;
+
+                    for (int64_t kw = 0; kw < KW; ++kw) {
+                        const int64_t iw = ow * s0 - p0 + kw * d0;
+                        if (iw < 0 || iw >= IW) continue;
+
+                        #pragma omp simd
+                        for (int64_t ic = 0; ic < IC; ++ic) {
+                            const float * kernel_ptr = kernel_channel + (kh * KW + kw) + ic * KW * KH;
+                            const float * src_ptr = src_batch + (ih * IW + iw) + ic * IW * IH;
+                            sum += (*kernel_ptr) * (*src_ptr);
+                        }
+                    }
+                }
+
+                dst_data[((n * OC + oc) * OH + oh) * OW + ow] = sum;
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_conv_2d_f16(
+        const ggml_compute_params * params,
+        const ggml_tensor * kernel,  // [KW, KH, IC, OC]
+        const ggml_tensor * src,     // [W, H, C, N]
+        ggml_tensor * dst) {         // [OW, OH, OC, N]
+
+    const int32_t s0 = ggml_get_op_params_i32(dst, 0);
+    const int32_t s1 = ggml_get_op_params_i32(dst, 1);
+    const int32_t p0 = ggml_get_op_params_i32(dst, 2);
+    const int32_t p1 = ggml_get_op_params_i32(dst, 3);
+    const int32_t d0 = ggml_get_op_params_i32(dst, 4);
+    const int32_t d1 = ggml_get_op_params_i32(dst, 5);
+
+    const int64_t OW = dst->ne[0];
+    const int64_t OH = dst->ne[1];
+    const int64_t OC = dst->ne[2];
+    const int64_t N  = dst->ne[3];
+
+    const int64_t IW = src->ne[0];
+    const int64_t IH = src->ne[1];
+    const int64_t IC = src->ne[2];
+
+    const int64_t KW = kernel->ne[0];
+    const int64_t KH = kernel->ne[1];
+
+    const ggml_fp16_t * kernel_data = (const ggml_fp16_t *)kernel->data;
+    const ggml_fp16_t * src_data    = (const ggml_fp16_t *)src->data;
+    ggml_fp16_t       * dst_data    = (ggml_fp16_t       *)dst->data;
+
+    const int64_t rows_total = OH * N;
+    const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
+    const int64_t row_start = params->ith * rows_per_thread;
+    const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
+
+    for (int64_t row = row_start; row < row_end; ++row) {
+        const int64_t oh = row % OH;
+        const int64_t n  = row / OH;
+        const ggml_fp16_t * src_batch = src_data + n * IW * IH * IC;
+
+        for (int64_t ow = 0; ow < OW; ++ow) {
+            for (int64_t oc = 0; oc < OC; ++oc) {
+                float sum = 0.0f;
+                const ggml_fp16_t * kernel_channel = kernel_data + oc * KW * KH * IC;
+                for (int64_t kh = 0; kh < KH; ++kh) {
+                    const int64_t ih = oh * s1 - p1 + kh * d1;
+                    if (ih < 0 || ih >= IH) continue;
+
+                    for (int64_t kw = 0; kw < KW; ++kw) {
+                        const int64_t iw = ow * s0 - p0 + kw * d0;
+                        if (iw < 0 || iw >= IW) continue;
+
+                        for (int64_t ic = 0; ic < IC; ++ic) {
+                            const ggml_fp16_t * kernel_ptr = kernel_channel + (kh * KW + kw) + ic * KW * KH;
+                            const ggml_fp16_t * src_ptr = src_batch + (ih * IW + iw) + ic * IW * IH;
+                            sum += GGML_FP16_TO_FP32(*kernel_ptr) * GGML_FP16_TO_FP32(*src_ptr);
+                        }
+                    }
+                }
+
+                dst_data[((n * OC + oc) * OH + oh) * OW + ow] = GGML_FP32_TO_FP16(sum);
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_conv_2d(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_conv_2d_f16(params, src0, src1, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 // ggml_compute_forward_conv_transpose_2d
 
 void ggml_compute_forward_conv_transpose_2d(
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index 5b384e4ba5fce..8d19fc925c2a7 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -65,6 +65,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc
 void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 14000b55aca1e..172a060490f6c 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1044,6 +1044,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "conv_transpose_1d(x)",
     "im2col(x)",
     "im2col_back(x)",
+    "conv_2d(x)",
     "conv_2d_dw(x)",
     "conv_transpose_2d(x)",
     "pool_1d(x)",
@@ -4291,6 +4292,44 @@ struct ggml_tensor * ggml_conv_2d_dw_direct(
     return result;
 }
 
+// ggml_conv_2d_direct
+
+struct ggml_tensor * ggml_conv_2d_direct(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,   // convolution kernel [KW, KH, IC, OC]
+        struct ggml_tensor  * b,   // input data [W, H, C, N]
+        int                   s0,  // stride dimension 0
+        int                   s1,  // stride dimension 1
+        int                   p0,  // padding dimension 0
+        int                   p1,  // padding dimension 1
+        int                   d0,  // dilation dimension 0
+        int                   d1) {// dilation dimension 1
+
+    GGML_ASSERT(a->ne[2] == b->ne[2]);
+    GGML_ASSERT(a->type == b->type);
+
+    int64_t ne[4];
+    ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
+    ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
+    ne[2] = a->ne[3];
+    ne[3] = b->ne[3];
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
+
+    ggml_set_op_params_i32(result, 0, s0);
+    ggml_set_op_params_i32(result, 1, s1);
+    ggml_set_op_params_i32(result, 2, p0);
+    ggml_set_op_params_i32(result, 3, p1);
+    ggml_set_op_params_i32(result, 4, d0);
+    ggml_set_op_params_i32(result, 5, d1);
+
+    result->op = GGML_OP_CONV_2D;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
 // ggml_conv_transpose_2d_p0
 
 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {

From 48b7fa2fe669aefec685956e63544df4dd1823e4 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Thu, 26 Jun 2025 14:53:10 +0800
Subject: [PATCH 2/8] Half decent

---
 ggml/src/ggml-cpu/ggml-cpu.c |  12 ++-
 ggml/src/ggml-cpu/ops.cpp    | 200 ++++++++++++++++++++++++++---------
 ggml/src/ggml-cpu/ops.h      |   4 +
 ggml/src/ggml.c              |   2 +-
 4 files changed, 168 insertions(+), 50 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 75d830f62b7e1..edf8222843220 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -687,6 +687,10 @@ static void ggml_init_arm_arch_features(void) {
 
 #endif // __ARM_ARCH
 
+void ggml_compute_forward_mul_mat(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst);
+
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
     GGML_ASSERT(!ggml_get_no_alloc(ctx));
 
@@ -1193,7 +1197,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
     }
 }
 
-static void ggml_compute_forward_mul_mat(
+void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
               struct ggml_tensor * dst) {
 
@@ -2751,6 +2755,12 @@ struct ggml_cplan ggml_graph_plan(
                             GGML_ABORT("fatal error");
                         }
                     } break;
+                case GGML_OP_CONV_2D:
+                    {
+                        cur = GGML_IM2COL_WORK_SIZE;
+                        //Add enough space for kernel transpose
+                        cur += sizeof(ggml_fp16_t)*node->src[1]->ne[0]*node->src[1]->ne[1]*node->src[1]->ne[2]*node->src[1]->ne[3];
+                    } break;
                 case GGML_OP_CONV_TRANSPOSE_2D:
                     {
                         const int64_t ne00 = node->src[0]->ne[0]; // W
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index fdc7990d9d153..0d82690fc9703 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -3,6 +3,7 @@
 #include "ggml-cpu.h"
 #include "ggml-impl.h"
 #include "binary-ops.h"
+#include "ggml.h"
 #include "unary-ops.h"
 #include "vec.h"
 
@@ -6545,70 +6546,173 @@ void ggml_compute_forward_im2col_back_f32(
     }
 }
 
+static void ggml_call_mul_mat(
+    const ggml_compute_params * params,
+    int64_t m, int64_t n, int64_t k,
+    void * a, void * b, void * c) {
+
+    struct ggml_tensor src1 = {};
+    src1.ne[0] = k;
+    src1.ne[1] = m;
+    src1.ne[2] = 1;
+    src1.ne[3] = 1;
+    src1.nb[0] = sizeof(float);
+    src1.nb[1] = k * sizeof(float);
+    src1.nb[2] = src1.nb[1];
+    src1.nb[3] = src1.nb[2];
+    src1.data  = a;
+
+    struct ggml_tensor src0 = {};
+    src0.ne[0] = k;
+    src0.ne[1] = n;
+    src0.ne[2] = 1;
+    src0.ne[3] = 1;
+    src0.nb[0] = sizeof(float);
+    src0.nb[1] = k * sizeof(float);
+    src0.nb[2] = src0.nb[1];
+    src0.nb[3] = src0.nb[2];
+    src0.data  = b;
+
+    struct ggml_tensor dst = {};
+    dst.ne[0] = n;
+    dst.ne[1] = m;
+    dst.ne[2] = 1;
+    dst.ne[3] = 1;
+    dst.nb[0] = sizeof(float);
+    dst.nb[1] = n * sizeof(float);
+    dst.nb[2] = dst.nb[1];
+    dst.nb[3] = dst.nb[2];
+    dst.data  = c;
+    dst.src[0] = &src0;
+    dst.src[1] = &src1;
+
+    ggml_compute_forward_mul_mat(params, &dst);
+}
+
+
 // ggml_compute_forward_conv_2d
 
-static void ggml_compute_forward_conv_2d_f32(
-        const ggml_compute_params * params,
-        const ggml_tensor * kernel,  // [KW, KH, IC, OC]
-        const ggml_tensor * src,     // [W, H, C, N]
-        ggml_tensor * dst) {         // [OW, OH, OC, N]
+static void ggml_compute_forward_conv_2d_f32(const ggml_compute_params * params,
+                                       ggml_tensor        * dst) {
 
-    const int32_t s0 = ggml_get_op_params_i32(dst, 0);
-    const int32_t s1 = ggml_get_op_params_i32(dst, 1);
-    const int32_t p0 = ggml_get_op_params_i32(dst, 2);
-    const int32_t p1 = ggml_get_op_params_i32(dst, 3);
-    const int32_t d0 = ggml_get_op_params_i32(dst, 4);
-    const int32_t d1 = ggml_get_op_params_i32(dst, 5);
+    const ggml_tensor * src    = dst->src[1];      // [W H C_in N]
+    const ggml_tensor * kernel = dst->src[0];      // [W H C_in C_out]
 
-    const int64_t OW = dst->ne[0];
-    const int64_t OH = dst->ne[1];
-    const int64_t OC = dst->ne[2];
-    const int64_t N  = dst->ne[3];
+    GGML_ASSERT(ggml_is_contiguous(kernel));
 
-    const int64_t IW = src->ne[0];
-    const int64_t IH = src->ne[1];
-    const int64_t IC = src->ne[2];
+    const int32_t stride_x = dst->op_params[0];
+    const int32_t stride_y = dst->op_params[1];
+    const int32_t pad_x    = dst->op_params[2];
+    const int32_t pad_y    = dst->op_params[3];
 
-    const int64_t KW = kernel->ne[0];
-    const int64_t KH = kernel->ne[1];
+    const int64_t c_in  = src->ne[2];
+    const int64_t c_out = kernel->ne[3];
+    GGML_ASSERT(c_in == kernel->ne[2]);
 
-    const float * kernel_data = (const float *)kernel->data;
-    const float * src_data    = (const float *)src->data;
-    float       * dst_data    = (float       *)dst->data;
+    const int64_t src_w = src->ne[0];
+    const int64_t src_h = src->ne[1];
+    const int64_t knl_w = kernel->ne[0];
+    const int64_t knl_h = kernel->ne[1];
+    const int64_t dst_w = dst->ne[0];
+    const int64_t dst_h = dst->ne[1];
 
-    const int64_t rows_total = OH * N;
-    const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
-    const int64_t row_start = params->ith * rows_per_thread;
-    const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
 
-    for (int64_t row = row_start; row < row_end; ++row) {
-        const int64_t oh = row % OH;
-        const int64_t n  = row / OH;
-        const float * src_batch = src_data + n * IW * IH * IC;
+    float * src_data = (float *) src->data;
+    float * knl_data = (float *) kernel->data;
+    float * dst_data = (      float *) dst->data;
 
-        for (int64_t ow = 0; ow < OW; ++ow) {
-            for (int64_t oc = 0; oc < OC; ++oc) {
-                float sum = 0.0f;
-                const float * kernel_channel = kernel_data + oc * KW * KH * IC;
 
-                for (int64_t kh = 0; kh < KH; ++kh) {
-                    const int64_t ih = oh * s1 - p1 + kh * d1;
-                    if (ih < 0 || ih >= IH) continue;
+    const int64_t knl_n           = knl_w * knl_h * c_in;
+    const int64_t patch_total     = dst->ne[3] * dst_w * dst_h;
+    
 
-                    for (int64_t kw = 0; kw < KW; ++kw) {
-                        const int64_t iw = ow * s0 - p0 + kw * d0;
-                        if (iw < 0 || iw >= IW) continue;
+    
+    const int64_t space_per_patch = knl_n * sizeof(float) + patch_total * c_out * sizeof(float);
 
-                        #pragma omp simd
-                        for (int64_t ic = 0; ic < IC; ++ic) {
-                            const float * kernel_ptr = kernel_channel + (kh * KW + kw) + ic * KW * KH;
-                            const float * src_ptr = src_batch + (ih * IW + iw) + ic * IW * IH;
-                            sum += (*kernel_ptr) * (*src_ptr);
+    const int64_t batch_size = params->wsize / space_per_patch;
+    const int64_t patches_per_batch = batch_size > 8 ? (batch_size / 8) * 8 : batch_size;
+    const int64_t batch_n = (patch_total + patches_per_batch - 1) / patches_per_batch;
+
+
+    GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1);
+
+    float * tmp = (float *) params->wdata;          // per-thread scratch
+
+    for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) {
+
+        const int64_t patch_start_batch = batch_i * patches_per_batch;
+        const int64_t patch_end_batch   = std::min(patch_start_batch + patches_per_batch,
+                                              patch_total);
+        const int64_t patch_n = patch_end_batch - patch_start_batch;
+
+        const int64_t patch_per_thread =
+            (patch_n + params->nth - 1) / params->nth;
+        const int64_t patch_start = patch_start_batch +
+                                    params->ith * patch_per_thread;
+        const int64_t patch_end   = std::min(patch_start + patch_per_thread,
+                                        patch_end_batch);
+
+        //im2col for a patch
+        for (int64_t p = patch_start; p < patch_end; ++p) {
+            const int64_t  b     =  p / (dst_w * dst_h);
+            const int64_t  dy    = (p / dst_w) % dst_h;
+            const int64_t  dx    =  p % dst_w;
+
+            const float  * src_base = (const float *)((char *)src_data + b * src->nb[3]);
+            float        * out_row = tmp + (p % patches_per_batch) * knl_n;
+
+            // Extract patch in IC,KH,KW order (same as im2col)
+            for (int64_t ic = 0; ic < c_in; ++ic) {
+                for (int64_t ky = 0; ky < knl_h; ++ky) {
+                    for (int64_t kx = 0; kx < knl_w; ++kx) {
+                        const int64_t sy = dy * stride_y + ky - pad_y;
+                        const int64_t sx = dx * stride_x + kx - pad_x;
+                        
+                        int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx;
+                        
+                        if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) {
+                            out_row[dst_idx] = 0.0f;
+                        } else {
+                            float * src_ptr = (float *)((char *)src_base + 
+                                sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]);
+                            out_row[dst_idx] = *src_ptr;
                         }
                     }
                 }
+            }
+        }   // patches handled by this thread
+
+        ggml_barrier(params->threadpool);   // wait for all threads
 
-                dst_data[((n * OC + oc) * OH + oh) * OW + ow] = sum;
+        //GEMM output is patch_n * cout 
+        float * gemm_output = tmp + patches_per_batch * knl_n;
+        
+        // GEMM: patches[patch_n, knl_n] × kernel[knl_n, c_out] = output[patch_n, c_out]
+        ggml_call_mul_mat(params, patch_n, c_out, knl_n,
+                            tmp, knl_data, gemm_output);
+        
+        // Barrier to ensure GEMM completes before permutation
+        ggml_barrier(params->threadpool);
+        
+        // Distribute permutation work across threads
+        const int64_t permute_per_thread = (patch_n + params->nth - 1) / params->nth;
+        const int64_t permute_start = params->ith * permute_per_thread;
+        const int64_t permute_end = std::min(permute_start + permute_per_thread, patch_n);
+        
+        // Each thread handles part of the permutation from [patch_n, c_out] to WHCN layout
+        for (int64_t i = permute_start; i < permute_end; ++i) {
+            const int64_t p = patch_start_batch + i;
+            const int64_t b  = p / (dst_w * dst_h);         // batch index
+            const int64_t dy = (p / dst_w) % dst_h;         // height index  
+            const int64_t dx = p % dst_w;                   // width index
+            
+            // Copy all channels for this spatial position
+            for (int64_t oc = 0; oc < c_out; ++oc) {
+                const float value = gemm_output[i * c_out + oc];
+                // Write to WHCN layout: dst[w, h, c, n]
+                float * dst_ptr = (float *)((char *)dst_data + 
+                    dx * dst->nb[0] + dy * dst->nb[1] + oc * dst->nb[2] + b * dst->nb[3]);
+                *dst_ptr = value;
             }
         }
     }
@@ -6693,7 +6797,7 @@ void ggml_compute_forward_conv_2d(
             } break;
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
+                ggml_compute_forward_conv_2d_f32(params, dst);
             } break;
         default:
             {
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index 8d19fc925c2a7..417f49074f4ed 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -20,6 +20,9 @@
 
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
+// Work buffer size for im2col operations in CONV2D
+#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)  // 16MB work buffer
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -108,6 +111,7 @@ void ggml_compute_forward_custom(const struct ggml_compute_params * params, stru
 void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 
 #ifdef __cplusplus
 }
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 172a060490f6c..e4bdd6efa6aad 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4306,7 +4306,7 @@ struct ggml_tensor * ggml_conv_2d_direct(
         int                   d1) {// dilation dimension 1
 
     GGML_ASSERT(a->ne[2] == b->ne[2]);
-    GGML_ASSERT(a->type == b->type);
+    //GGML_ASSERT(a->type == b->type);
 
     int64_t ne[4];
     ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);

From a2c0311038dd4e5a4785c680283463a37631dd9c Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Thu, 26 Jun 2025 17:50:25 +0800
Subject: [PATCH 3/8] Tiled approach for F32

---
 ggml/src/ggml-cpu/ops.cpp | 190 +++++++++++---------------------------
 tests/CMakeLists.txt      |   1 +
 2 files changed, 53 insertions(+), 138 deletions(-)

diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 0d82690fc9703..31310914a6799 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6592,18 +6592,21 @@ static void ggml_call_mul_mat(
 
 // ggml_compute_forward_conv_2d
 
-static void ggml_compute_forward_conv_2d_f32(const ggml_compute_params * params,
-                                       ggml_tensor        * dst) {
-
-    const ggml_tensor * src    = dst->src[1];      // [W H C_in N]
-    const ggml_tensor * kernel = dst->src[0];      // [W H C_in C_out]
+static void ggml_compute_forward_conv_2d_f32(
+        const ggml_compute_params * params,
+        const ggml_tensor * kernel,  // [KW, KH, IC, OC] - fp32
+        const ggml_tensor * src,     // [W, H, C, N]
+        ggml_tensor * dst) {         // [OW, OH, OC, N]
 
     GGML_ASSERT(ggml_is_contiguous(kernel));
+    GGML_ASSERT(kernel->type == GGML_TYPE_F32);
 
-    const int32_t stride_x = dst->op_params[0];
-    const int32_t stride_y = dst->op_params[1];
-    const int32_t pad_x    = dst->op_params[2];
-    const int32_t pad_y    = dst->op_params[3];
+    const int32_t stride_x   = dst->op_params[0];
+    const int32_t stride_y   = dst->op_params[1];
+    const int32_t pad_x      = dst->op_params[2];
+    const int32_t pad_y      = dst->op_params[3];
+    const int32_t dilation_x = dst->op_params[4];
+    const int32_t dilation_y = dst->op_params[5];
 
     const int64_t c_in  = src->ne[2];
     const int64_t c_out = kernel->ne[3];
@@ -6616,173 +6619,93 @@ static void ggml_compute_forward_conv_2d_f32(const ggml_compute_params * params,
     const int64_t dst_w = dst->ne[0];
     const int64_t dst_h = dst->ne[1];
 
-
-    float * src_data = (float *) src->data;
-    float * knl_data = (float *) kernel->data;
-    float * dst_data = (      float *) dst->data;
-
+    float * src_data = (float*) src->data;
+    float * knl_data = (float*) kernel->data;
+    float * dst_data = (float*) dst->data;
 
     const int64_t knl_n           = knl_w * knl_h * c_in;
     const int64_t patch_total     = dst->ne[3] * dst_w * dst_h;
-    
-
-    
-    const int64_t space_per_patch = knl_n * sizeof(float) + patch_total * c_out * sizeof(float);
 
-    const int64_t batch_size = params->wsize / space_per_patch;
+    const int64_t space_per_patch   = knl_n * sizeof(float) + c_out * sizeof(float);
+    const int64_t batch_size        = params->wsize / space_per_patch;
     const int64_t patches_per_batch = batch_size > 8 ? (batch_size / 8) * 8 : batch_size;
-    const int64_t batch_n = (patch_total + patches_per_batch - 1) / patches_per_batch;
-
+    const int64_t batch_n           = (patch_total + patches_per_batch - 1) / patches_per_batch;
 
     GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1);
 
-    float * tmp = (float *) params->wdata;          // per-thread scratch
+    float * tmp = (float *) params->wdata;
 
     for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) {
 
         const int64_t patch_start_batch = batch_i * patches_per_batch;
         const int64_t patch_end_batch   = std::min(patch_start_batch + patches_per_batch,
                                               patch_total);
-        const int64_t patch_n = patch_end_batch - patch_start_batch;
+        const int64_t patch_n           = patch_end_batch - patch_start_batch;
 
-        const int64_t patch_per_thread =
-            (patch_n + params->nth - 1) / params->nth;
-        const int64_t patch_start = patch_start_batch +
-                                    params->ith * patch_per_thread;
-        const int64_t patch_end   = std::min(patch_start + patch_per_thread,
-                                        patch_end_batch);
+        const int64_t patch_per_thread  = (patch_n + params->nth - 1) / params->nth;
+        const int64_t patch_start       = patch_start_batch + params->ith * patch_per_thread;
+        const int64_t patch_end         = std::min(patch_start + patch_per_thread,patch_end_batch);
 
         //im2col for a patch
         for (int64_t p = patch_start; p < patch_end; ++p) {
-            const int64_t  b     =  p / (dst_w * dst_h);
-            const int64_t  dy    = (p / dst_w) % dst_h;
-            const int64_t  dx    =  p % dst_w;
+            const int64_t  batch_n     =  p / (dst_w * dst_h);
+            const int64_t  src_x       = (p / dst_w) % dst_h;
+            const int64_t  src_y       =  p % dst_w;
 
-            const float  * src_base = (const float *)((char *)src_data + b * src->nb[3]);
-            float        * out_row = tmp + (p % patches_per_batch) * knl_n;
+            float * src_base = (float *)((char *)src_data + batch_n * src->nb[3]);
+            float * dst_row  = tmp + (p % patches_per_batch) * knl_n;
 
-            // Extract patch in IC,KH,KW order (same as im2col)
             for (int64_t ic = 0; ic < c_in; ++ic) {
                 for (int64_t ky = 0; ky < knl_h; ++ky) {
                     for (int64_t kx = 0; kx < knl_w; ++kx) {
-                        const int64_t sy = dy * stride_y + ky - pad_y;
-                        const int64_t sx = dx * stride_x + kx - pad_x;
-                        
+                        const int64_t sy = src_x * stride_y + ky * dilation_y - pad_y;
+                        const int64_t sx = src_y * stride_x + kx * dilation_x - pad_x;
+
                         int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx;
-                        
+
                         if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) {
-                            out_row[dst_idx] = 0.0f;
+                            dst_row[dst_idx] = 0.0f;
                         } else {
-                            float * src_ptr = (float *)((char *)src_base + 
-                                sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]);
-                            out_row[dst_idx] = *src_ptr;
+                            float * src_ptr = (float *)((char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]);
+                            dst_row[dst_idx] = *src_ptr;
                         }
                     }
                 }
             }
         }   // patches handled by this thread
 
-        ggml_barrier(params->threadpool);   // wait for all threads
+        ggml_barrier(params->threadpool);
 
-        //GEMM output is patch_n * cout 
         float * gemm_output = tmp + patches_per_batch * knl_n;
-        
+
         // GEMM: patches[patch_n, knl_n] × kernel[knl_n, c_out] = output[patch_n, c_out]
         ggml_call_mul_mat(params, patch_n, c_out, knl_n,
                             tmp, knl_data, gemm_output);
-        
-        // Barrier to ensure GEMM completes before permutation
+
         ggml_barrier(params->threadpool);
-        
-        // Distribute permutation work across threads
+
+
+        //permute back [OC, N, OH, OW] to [N, OC, OH, OW]
         const int64_t permute_per_thread = (patch_n + params->nth - 1) / params->nth;
         const int64_t permute_start = params->ith * permute_per_thread;
         const int64_t permute_end = std::min(permute_start + permute_per_thread, patch_n);
-        
-        // Each thread handles part of the permutation from [patch_n, c_out] to WHCN layout
+
         for (int64_t i = permute_start; i < permute_end; ++i) {
-            const int64_t p = patch_start_batch + i;
-            const int64_t b  = p / (dst_w * dst_h);         // batch index
-            const int64_t dy = (p / dst_w) % dst_h;         // height index  
-            const int64_t dx = p % dst_w;                   // width index
-            
-            // Copy all channels for this spatial position
+            const int64_t p       = patch_start_batch + i;
+            const int64_t batch_n = p / (dst_w * dst_h);
+            const int64_t dst_y   = (p / dst_w) % dst_h;
+            const int64_t dst_x   = p % dst_w;
+
             for (int64_t oc = 0; oc < c_out; ++oc) {
                 const float value = gemm_output[i * c_out + oc];
                 // Write to WHCN layout: dst[w, h, c, n]
-                float * dst_ptr = (float *)((char *)dst_data + 
-                    dx * dst->nb[0] + dy * dst->nb[1] + oc * dst->nb[2] + b * dst->nb[3]);
+                float * dst_ptr = (float *)((char *)dst_data + dst_x * dst->nb[0] + dst_y * dst->nb[1] + oc * dst->nb[2] + batch_n * dst->nb[3]);
                 *dst_ptr = value;
             }
         }
     }
 }
 
-static void ggml_compute_forward_conv_2d_f16(
-        const ggml_compute_params * params,
-        const ggml_tensor * kernel,  // [KW, KH, IC, OC]
-        const ggml_tensor * src,     // [W, H, C, N]
-        ggml_tensor * dst) {         // [OW, OH, OC, N]
-
-    const int32_t s0 = ggml_get_op_params_i32(dst, 0);
-    const int32_t s1 = ggml_get_op_params_i32(dst, 1);
-    const int32_t p0 = ggml_get_op_params_i32(dst, 2);
-    const int32_t p1 = ggml_get_op_params_i32(dst, 3);
-    const int32_t d0 = ggml_get_op_params_i32(dst, 4);
-    const int32_t d1 = ggml_get_op_params_i32(dst, 5);
-
-    const int64_t OW = dst->ne[0];
-    const int64_t OH = dst->ne[1];
-    const int64_t OC = dst->ne[2];
-    const int64_t N  = dst->ne[3];
-
-    const int64_t IW = src->ne[0];
-    const int64_t IH = src->ne[1];
-    const int64_t IC = src->ne[2];
-
-    const int64_t KW = kernel->ne[0];
-    const int64_t KH = kernel->ne[1];
-
-    const ggml_fp16_t * kernel_data = (const ggml_fp16_t *)kernel->data;
-    const ggml_fp16_t * src_data    = (const ggml_fp16_t *)src->data;
-    ggml_fp16_t       * dst_data    = (ggml_fp16_t       *)dst->data;
-
-    const int64_t rows_total = OH * N;
-    const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
-    const int64_t row_start = params->ith * rows_per_thread;
-    const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
-
-    for (int64_t row = row_start; row < row_end; ++row) {
-        const int64_t oh = row % OH;
-        const int64_t n  = row / OH;
-        const ggml_fp16_t * src_batch = src_data + n * IW * IH * IC;
-
-        for (int64_t ow = 0; ow < OW; ++ow) {
-            for (int64_t oc = 0; oc < OC; ++oc) {
-                float sum = 0.0f;
-                const ggml_fp16_t * kernel_channel = kernel_data + oc * KW * KH * IC;
-                for (int64_t kh = 0; kh < KH; ++kh) {
-                    const int64_t ih = oh * s1 - p1 + kh * d1;
-                    if (ih < 0 || ih >= IH) continue;
-
-                    for (int64_t kw = 0; kw < KW; ++kw) {
-                        const int64_t iw = ow * s0 - p0 + kw * d0;
-                        if (iw < 0 || iw >= IW) continue;
-
-                        for (int64_t ic = 0; ic < IC; ++ic) {
-                            const ggml_fp16_t * kernel_ptr = kernel_channel + (kh * KW + kw) + ic * KW * KH;
-                            const ggml_fp16_t * src_ptr = src_batch + (ih * IW + iw) + ic * IW * IH;
-                            sum += GGML_FP16_TO_FP32(*kernel_ptr) * GGML_FP16_TO_FP32(*src_ptr);
-                        }
-                    }
-                }
-
-                dst_data[((n * OC + oc) * OH + oh) * OW + ow] = GGML_FP32_TO_FP16(sum);
-            }
-        }
-    }
-}
-
 void ggml_compute_forward_conv_2d(
         const ggml_compute_params * params,
         ggml_tensor * dst) {
@@ -6790,19 +6713,10 @@ void ggml_compute_forward_conv_2d(
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
 
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_conv_2d_f16(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_conv_2d_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
+    if (src0->type == GGML_TYPE_F16) {
+        GGML_ASSERT(false && "F16 not supported yet");
+    } else {
+        ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
     }
 }
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index fc1557a2d4065..517cc7a945e25 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -195,6 +195,7 @@ endif()
 # llama_build_and_test(test-opt.cpp) # SLOW
 llama_build_and_test(test-gguf.cpp)
 llama_build_and_test(test-backend-ops.cpp)
+llama_build_and_test(test_conv2d_comparison.cpp)
 
 llama_build_and_test(test-model-load-cancel.cpp  LABEL "model")
 llama_build_and_test(test-autorelease.cpp        LABEL "model")

From 4b1e71a98864f29b3cc3b87d99cf847efc6e8374 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Thu, 26 Jun 2025 17:54:12 +0800
Subject: [PATCH 4/8] remove file

---
 ggml/src/ggml-cpu/ggml-cpu.c | 2 --
 tests/CMakeLists.txt         | 1 -
 2 files changed, 3 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index edf8222843220..219e87770b65e 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2758,8 +2758,6 @@ struct ggml_cplan ggml_graph_plan(
                 case GGML_OP_CONV_2D:
                     {
                         cur = GGML_IM2COL_WORK_SIZE;
-                        //Add enough space for kernel transpose
-                        cur += sizeof(ggml_fp16_t)*node->src[1]->ne[0]*node->src[1]->ne[1]*node->src[1]->ne[2]*node->src[1]->ne[3];
                     } break;
                 case GGML_OP_CONV_TRANSPOSE_2D:
                     {
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 517cc7a945e25..fc1557a2d4065 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -195,7 +195,6 @@ endif()
 # llama_build_and_test(test-opt.cpp) # SLOW
 llama_build_and_test(test-gguf.cpp)
 llama_build_and_test(test-backend-ops.cpp)
-llama_build_and_test(test_conv2d_comparison.cpp)
 
 llama_build_and_test(test-model-load-cancel.cpp  LABEL "model")
 llama_build_and_test(test-autorelease.cpp        LABEL "model")

From aed4e1ffab02de6a831af422da6d66f63f3b24cb Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Thu, 26 Jun 2025 22:59:58 +0800
Subject: [PATCH 5/8] Fix tests

---
 ggml/src/ggml-cpu/ggml-cpu.c | 4 ----
 ggml/src/ggml-cpu/ops.h      | 2 +-
 ggml/src/ggml.c              | 1 +
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 219e87770b65e..11ff228f07a44 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -687,10 +687,6 @@ static void ggml_init_arm_arch_features(void) {
 
 #endif // __ARM_ARCH
 
-void ggml_compute_forward_mul_mat(
-        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst);
-
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
     GGML_ASSERT(!ggml_get_no_alloc(ctx));
 
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index 417f49074f4ed..3a32ec20dba2b 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -21,7 +21,7 @@
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
 // Work buffer size for im2col operations in CONV2D
-#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)  // 16MB work buffer
+#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index e4bdd6efa6aad..7d407315fea78 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -945,6 +945,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CONV_TRANSPOSE_1D",
     "IM2COL",
     "IM2COL_BACK",
+    "CONV_2D",
     "CONV_2D_DW",
     "CONV_TRANSPOSE_2D",
     "POOL_1D",

From 966aa76c1c24afbfd308a04265d8a75b2e894912 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Sat, 28 Jun 2025 17:10:23 +0800
Subject: [PATCH 6/8] Support F16 operations

---
 ggml/src/ggml-cpu/ops.cpp | 65 ++++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 31 deletions(-)

diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 31310914a6799..5839945b405a3 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6546,29 +6546,29 @@ void ggml_compute_forward_im2col_back_f32(
     }
 }
 
-static void ggml_call_mul_mat(
-    const ggml_compute_params * params,
-    int64_t m, int64_t n, int64_t k,
-    void * a, void * b, void * c) {
-
+static void ggml_call_mul_mat(ggml_type T, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
+                              void * a, void * b, void * c) {
+    const ggml_type_traits * traits = ggml_get_type_traits(T);
     struct ggml_tensor src1 = {};
+    src1.type  = T;
     src1.ne[0] = k;
     src1.ne[1] = m;
     src1.ne[2] = 1;
     src1.ne[3] = 1;
-    src1.nb[0] = sizeof(float);
-    src1.nb[1] = k * sizeof(float);
+    src1.nb[0] = traits->type_size;
+    src1.nb[1] = k * traits->type_size;
     src1.nb[2] = src1.nb[1];
     src1.nb[3] = src1.nb[2];
     src1.data  = a;
 
     struct ggml_tensor src0 = {};
+    src0.type  = T;
     src0.ne[0] = k;
     src0.ne[1] = n;
     src0.ne[2] = 1;
     src0.ne[3] = 1;
-    src0.nb[0] = sizeof(float);
-    src0.nb[1] = k * sizeof(float);
+    src0.nb[0] = traits->type_size;
+    src0.nb[1] = k * traits->type_size;
     src0.nb[2] = src0.nb[1];
     src0.nb[3] = src0.nb[2];
     src0.data  = b;
@@ -6589,17 +6589,18 @@ static void ggml_call_mul_mat(
     ggml_compute_forward_mul_mat(params, &dst);
 }
 
-
 // ggml_compute_forward_conv_2d
 
-static void ggml_compute_forward_conv_2d_f32(
-        const ggml_compute_params * params,
-        const ggml_tensor * kernel,  // [KW, KH, IC, OC] - fp32
-        const ggml_tensor * src,     // [W, H, C, N]
-        ggml_tensor * dst) {         // [OW, OH, OC, N]
+static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params,
+                                              const ggml_tensor *         kernel,  // [KW, KH, IC, OC]
+                                              const ggml_tensor *         src,     // [W, H, C, N]
+                                              ggml_tensor *               dst,     // [OW, OH, OC, N]
+                                              ggml_type                   kernel_type) {
 
     GGML_ASSERT(ggml_is_contiguous(kernel));
-    GGML_ASSERT(kernel->type == GGML_TYPE_F32);
+    GGML_ASSERT(kernel->type == kernel_type);
+
+    const ggml_type_traits * traits = ggml_get_type_traits(kernel_type);
 
     const int32_t stride_x   = dst->op_params[0];
     const int32_t stride_y   = dst->op_params[1];
@@ -6620,20 +6621,20 @@ static void ggml_compute_forward_conv_2d_f32(
     const int64_t dst_h = dst->ne[1];
 
     float * src_data = (float*) src->data;
-    float * knl_data = (float*) kernel->data;
+    void  * knl_data = kernel->data;
     float * dst_data = (float*) dst->data;
 
     const int64_t knl_n           = knl_w * knl_h * c_in;
     const int64_t patch_total     = dst->ne[3] * dst_w * dst_h;
 
-    const int64_t space_per_patch   = knl_n * sizeof(float) + c_out * sizeof(float);
+    const int64_t space_per_patch   = knl_n * traits->type_size + c_out * sizeof(float);
     const int64_t batch_size        = params->wsize / space_per_patch;
     const int64_t patches_per_batch = batch_size > 8 ? (batch_size / 8) * 8 : batch_size;
     const int64_t batch_n           = (patch_total + patches_per_batch - 1) / patches_per_batch;
 
     GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1);
 
-    float * tmp = (float *) params->wdata;
+    void * tmp = params->wdata;
 
     for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) {
 
@@ -6653,7 +6654,7 @@ static void ggml_compute_forward_conv_2d_f32(
             const int64_t  src_y       =  p % dst_w;
 
             float * src_base = (float *)((char *)src_data + batch_n * src->nb[3]);
-            float * dst_row  = tmp + (p % patches_per_batch) * knl_n;
+            char *  dst_row  = (char *) tmp + (p % patches_per_batch) * knl_n * traits->type_size;
 
             for (int64_t ic = 0; ic < c_in; ++ic) {
                 for (int64_t ky = 0; ky < knl_h; ++ky) {
@@ -6663,11 +6664,19 @@ static void ggml_compute_forward_conv_2d_f32(
 
                         int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx;
 
+                        float src_val;
                         if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) {
-                            dst_row[dst_idx] = 0.0f;
+                            src_val = 0.0f;
                         } else {
                             float * src_ptr = (float *)((char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]);
-                            dst_row[dst_idx] = *src_ptr;
+                            src_val         = *src_ptr;
+                        }
+
+                        char * element_ptr = dst_row + dst_idx * traits->type_size;
+                        if (kernel_type == GGML_TYPE_F32) {
+                            *(float *) element_ptr = src_val;
+                        } else if (kernel_type == GGML_TYPE_F16) {
+                            *(ggml_fp16_t *) element_ptr = GGML_FP32_TO_FP16(src_val);
                         }
                     }
                 }
@@ -6676,11 +6685,10 @@ static void ggml_compute_forward_conv_2d_f32(
 
         ggml_barrier(params->threadpool);
 
-        float * gemm_output = tmp + patches_per_batch * knl_n;
+        float * gemm_output = (float *) ((char *) tmp + patches_per_batch * knl_n * traits->type_size);
 
         // GEMM: patches[patch_n, knl_n] × kernel[knl_n, c_out] = output[patch_n, c_out]
-        ggml_call_mul_mat(params, patch_n, c_out, knl_n,
-                            tmp, knl_data, gemm_output);
+        ggml_call_mul_mat(kernel_type, params, patch_n, c_out, knl_n, tmp, knl_data, gemm_output);
 
         ggml_barrier(params->threadpool);
 
@@ -6698,7 +6706,6 @@ static void ggml_compute_forward_conv_2d_f32(
 
             for (int64_t oc = 0; oc < c_out; ++oc) {
                 const float value = gemm_output[i * c_out + oc];
-                // Write to WHCN layout: dst[w, h, c, n]
                 float * dst_ptr = (float *)((char *)dst_data + dst_x * dst->nb[0] + dst_y * dst->nb[1] + oc * dst->nb[2] + batch_n * dst->nb[3]);
                 *dst_ptr = value;
             }
@@ -6713,11 +6720,7 @@ void ggml_compute_forward_conv_2d(
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
 
-    if (src0->type == GGML_TYPE_F16) {
-        GGML_ASSERT(false && "F16 not supported yet");
-    } else {
-        ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
-    }
+    ggml_compute_forward_conv_2d_impl(params, src0, src1, dst, src0->type);
 }
 
 // ggml_compute_forward_conv_transpose_2d

From 29d77dc03cf59b45386783703f3f32e123c72f99 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Sun, 29 Jun 2025 11:27:35 +0800
Subject: [PATCH 7/8] add assert about size

---
 ggml/src/ggml-cpu/ops.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 5839945b405a3..39eb2e3aadfef 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6645,7 +6645,7 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params
 
         const int64_t patch_per_thread  = (patch_n + params->nth - 1) / params->nth;
         const int64_t patch_start       = patch_start_batch + params->ith * patch_per_thread;
-        const int64_t patch_end         = std::min(patch_start + patch_per_thread,patch_end_batch);
+        const int64_t patch_end         = std::min(patch_start + patch_per_thread, patch_end_batch);
 
         //im2col for a patch
         for (int64_t p = patch_start; p < patch_end; ++p) {
@@ -6687,6 +6687,8 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params
 
         float * gemm_output = (float *) ((char *) tmp + patches_per_batch * knl_n * traits->type_size);
 
+        GGML_ASSERT(gemm_output + patch_n * c_out <= (float*)tmp + params->wsize);
+
         // GEMM: patches[patch_n, knl_n] × kernel[knl_n, c_out] = output[patch_n, c_out]
         ggml_call_mul_mat(kernel_type, params, patch_n, c_out, knl_n, tmp, knl_data, gemm_output);
 

From 808f526b93124ed5b5470aa9cc0610a04670dcc2 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Mon, 30 Jun 2025 22:27:01 +0800
Subject: [PATCH 8/8] Review: further formatting fixes, add assert and use CPU
 version of fp32->fp16

---
 ggml/src/ggml-cpu/ops.cpp | 27 ++++++++++++++-------------
 ggml/src/ggml.c           |  4 ++--
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 39eb2e3aadfef..6948c00b44a19 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6546,11 +6546,11 @@ void ggml_compute_forward_im2col_back_f32(
     }
 }
 
-static void ggml_call_mul_mat(ggml_type T, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
-                              void * a, void * b, void * c) {
-    const ggml_type_traits * traits = ggml_get_type_traits(T);
+static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
+                              void * a, void * b, float * c) {
+    const ggml_type_traits * traits = ggml_get_type_traits(type);
     struct ggml_tensor src1 = {};
-    src1.type  = T;
+    src1.type  = type;
     src1.ne[0] = k;
     src1.ne[1] = m;
     src1.ne[2] = 1;
@@ -6562,7 +6562,7 @@ static void ggml_call_mul_mat(ggml_type T, const ggml_compute_params * params, i
     src1.data  = a;
 
     struct ggml_tensor src0 = {};
-    src0.type  = T;
+    src0.type  = type;
     src0.ne[0] = k;
     src0.ne[1] = n;
     src0.ne[2] = 1;
@@ -6598,6 +6598,7 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params
                                               ggml_type                   kernel_type) {
 
     GGML_ASSERT(ggml_is_contiguous(kernel));
+    GGML_ASSERT(kernel_type == GGML_TYPE_F16 || kernel_type == GGML_TYPE_F32);
     GGML_ASSERT(kernel->type == kernel_type);
 
     const ggml_type_traits * traits = ggml_get_type_traits(kernel_type);
@@ -6620,9 +6621,9 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params
     const int64_t dst_w = dst->ne[0];
     const int64_t dst_h = dst->ne[1];
 
-    float * src_data = (float*) src->data;
-    void  * knl_data = kernel->data;
-    float * dst_data = (float*) dst->data;
+    const float * src_data = (float *) src->data;
+    void  * knl_data       = kernel->data;
+    float * dst_data       = (float *) dst->data;
 
     const int64_t knl_n           = knl_w * knl_h * c_in;
     const int64_t patch_total     = dst->ne[3] * dst_w * dst_h;
@@ -6653,8 +6654,8 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params
             const int64_t  src_x       = (p / dst_w) % dst_h;
             const int64_t  src_y       =  p % dst_w;
 
-            float * src_base = (float *)((char *)src_data + batch_n * src->nb[3]);
-            char *  dst_row  = (char *) tmp + (p % patches_per_batch) * knl_n * traits->type_size;
+            const float * src_base = (const float *)((const char *)src_data + batch_n * src->nb[3]);
+            char *        dst_row  = (char *) tmp + (p % patches_per_batch) * knl_n * traits->type_size;
 
             for (int64_t ic = 0; ic < c_in; ++ic) {
                 for (int64_t ky = 0; ky < knl_h; ++ky) {
@@ -6668,15 +6669,15 @@ static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params
                         if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) {
                             src_val = 0.0f;
                         } else {
-                            float * src_ptr = (float *)((char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]);
-                            src_val         = *src_ptr;
+                            const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]);
+                            src_val               = *src_ptr;
                         }
 
                         char * element_ptr = dst_row + dst_idx * traits->type_size;
                         if (kernel_type == GGML_TYPE_F32) {
                             *(float *) element_ptr = src_val;
                         } else if (kernel_type == GGML_TYPE_F16) {
-                            *(ggml_fp16_t *) element_ptr = GGML_FP32_TO_FP16(src_val);
+                            *(ggml_fp16_t *) element_ptr = GGML_CPU_FP32_TO_FP16(src_val);
                         }
                     }
                 }
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 7d407315fea78..47fe37b13e07d 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -987,7 +987,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GLU",
 };
 
-static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");
+static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1087,7 +1087,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "glu(x)",
 };
 
-static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");
+static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");