
Commit 39eba35

implement swapped variants (cpu/cuda)
1 parent e3d2b20 commit 39eba35

7 files changed, +117 −45 lines

ggml/include/ggml.h

Lines changed: 15 additions & 1 deletion
@@ -1100,23 +1100,37 @@ extern "C" {
     // gated linear unit ops
     // A: n columns, r rows,
     // result is n / 2 columns, r rows,
+    // expects gate in second half of row, unless swapped is true
     GGML_API struct ggml_tensor * ggml_glu(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            enum ggml_glu_op      op);
+            enum ggml_glu_op      op,
+            bool                  swapped);

     GGML_API struct ggml_tensor * ggml_reglu(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_reglu_swapped(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     GGML_API struct ggml_tensor * ggml_geglu(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_geglu_swapped(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     GGML_API struct ggml_tensor * ggml_swiglu(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_swiglu_swapped(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // normalize along rows
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,
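The header change above is the whole public surface of this commit: ggml_glu gains a swapped flag and each gated op gets a _swapped convenience wrapper. A minimal usage sketch follows; the function name and tensor layout are illustrative assumptions, not part of the diff.

// Hypothetical usage sketch (only the ggml_* calls come from this commit).
// Here each row of `up_gate` is assumed to hold the gate in the FIRST half.
#include "ggml.h"

struct ggml_tensor * build_glu(struct ggml_context * ctx, struct ggml_tensor * up_gate) {
    // up_gate: n columns, r rows; result: n / 2 columns, r rows
    return ggml_swiglu_swapped(ctx, up_gate);   // gate read from first half of each row
    // return ggml_swiglu(ctx, up_gate);        // default: gate in second half
}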

ggml/src/ggml-cpu/ops.cpp

Lines changed: 24 additions & 6 deletions
@@ -3214,6 +3214,8 @@ static void ggml_compute_forward_reglu_f32(
     GGML_ASSERT(dst->ne[0] == nc);
     GGML_ASSERT(ggml_nrows(dst) == nr);

+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
     // rows per thread
     const int dr = (nr + nth - 1)/nth;

@@ -3224,7 +3226,8 @@ static void ggml_compute_forward_reglu_f32(
     for (int i1 = ir0; i1 < ir1; i1++) {
         ggml_vec_reglu_f32(nc,
                 (float *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+                (float *) ((char *) src0->data + i1*(src0->nb[1])) + (swapped ? nc : 0),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])) + (swapped ? 0 : nc));

 #ifndef NDEBUG
         for (int k = 0; k < nc; k++) {
@@ -3255,6 +3258,8 @@ static void ggml_compute_forward_reglu_f16(
     GGML_ASSERT(dst->ne[0] == nc);
     GGML_ASSERT(ggml_nrows(dst) == nr);

+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
     // rows per thread
     const int dr = (nr + nth - 1)/nth;

@@ -3265,7 +3270,8 @@ static void ggml_compute_forward_reglu_f16(
     for (int i1 = ir0; i1 < ir1; i1++) {
         ggml_vec_reglu_f16(nc,
                 (ggml_fp16_t *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
+                (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])) + (swapped ? nc : 0),
+                (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])) + (swapped ? 0 : nc));

 #ifndef NDEBUG
         for (int k = 0; k < nc; k++) {
@@ -3321,6 +3327,8 @@ static void ggml_compute_forward_geglu_f32(
     GGML_ASSERT(dst->ne[0] == nc);
     GGML_ASSERT(ggml_nrows(dst) == nr);

+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
     // rows per thread
     const int dr = (nr + nth - 1)/nth;

@@ -3331,7 +3339,8 @@ static void ggml_compute_forward_geglu_f32(
     for (int i1 = ir0; i1 < ir1; i1++) {
         ggml_vec_geglu_f32(nc,
                 (float *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+                (float *) ((char *) src0->data + i1*(src0->nb[1])) + (swapped ? nc : 0),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])) + (swapped ? 0 : nc));

 #ifndef NDEBUG
         for (int k = 0; k < nc; k++) {
@@ -3362,6 +3371,8 @@ static void ggml_compute_forward_geglu_f16(
     GGML_ASSERT(dst->ne[0] == nc);
     GGML_ASSERT(ggml_nrows(dst) == nr);

+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
     // rows per thread
     const int dr = (nr + nth - 1)/nth;

@@ -3372,7 +3383,8 @@ static void ggml_compute_forward_geglu_f16(
     for (int i1 = ir0; i1 < ir1; i1++) {
         ggml_vec_geglu_f16(nc,
                 (ggml_fp16_t *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
+                (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])) + (swapped ? nc : 0),
+                (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])) + (swapped ? 0 : nc));

 #ifndef NDEBUG
         for (int k = 0; k < nc; k++) {
@@ -3428,6 +3440,8 @@ static void ggml_compute_forward_swiglu_f32(
     GGML_ASSERT(dst->ne[0] == nc);
     GGML_ASSERT(ggml_nrows(dst) == nr);

+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
     // rows per thread
     const int dr = (nr + nth - 1)/nth;

@@ -3438,7 +3452,8 @@ static void ggml_compute_forward_swiglu_f32(
     for (int i1 = ir0; i1 < ir1; i1++) {
         ggml_vec_swiglu_f32(nc,
                 (float *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+                (float *) ((char *) src0->data + i1*(src0->nb[1])) + (swapped ? nc : 0),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])) + (swapped ? 0 : nc));

 #ifndef NDEBUG
         for (int k = 0; k < nc; k++) {
@@ -3469,6 +3484,8 @@ static void ggml_compute_forward_swiglu_f16(
     GGML_ASSERT(dst->ne[0] == nc);
     GGML_ASSERT(ggml_nrows(dst) == nr);

+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+
     // rows per thread
     const int dr = (nr + nth - 1)/nth;

@@ -3479,7 +3496,8 @@ static void ggml_compute_forward_swiglu_f16(
     for (int i1 = ir0; i1 < ir1; i1++) {
         ggml_vec_swiglu_f16(nc,
                 (ggml_fp16_t *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
+                (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])) + (swapped ? nc : 0),
+                (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])) + (swapped ? 0 : nc));

 #ifndef NDEBUG
         for (int k = 0; k < nc; k++) {
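Every forward function above repeats the same two-step pattern: read the swapped flag from op-param slot 1, then offset the source row pointer so the activation half and the gate half can be passed separately. A standalone sketch of just that pointer selection; the function and parameter names are illustrative, not identifiers from the commit.

// Sketch only: `row` points at one source row of 2*nc values.
static void select_halves(const float * row, int nc, int swapped,
                          const float ** act, const float ** gate) {
    *act  = row + (swapped ? nc : 0);   // values fed through the base op
    *gate = row + (swapped ? 0 : nc);   // values multiplied in afterwards
}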

ggml/src/ggml-cpu/vec.cpp

Lines changed: 6 additions & 6 deletions
@@ -254,27 +254,27 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) {
     }
 }

-void ggml_vec_swiglu_f32(const int n, float * y, const float * x) {
+void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g) {
     int i = 0;
 #if defined(__AVX512F__) && defined(__AVX512DQ__)
     for (; i + 15 < n; i += 16) {
-        _mm512_storeu_ps(y + i, _mm512_mul_ps(ggml_v_silu(_mm512_loadu_ps(x + i)), _mm512_loadu_ps(x + i + n)));
+        _mm512_storeu_ps(y + i, _mm512_mul_ps(ggml_v_silu(_mm512_loadu_ps(x + i)), _mm512_loadu_ps(g + i)));
     }
 #elif defined(__AVX2__) && defined(__FMA__)
     for (; i + 7 < n; i += 8) {
-        _mm256_storeu_ps(y + i, _mm256_mul_ps(ggml_v_silu(_mm256_loadu_ps(x + i)), _mm256_loadu_ps(x + i + n)));
+        _mm256_storeu_ps(y + i, _mm256_mul_ps(ggml_v_silu(_mm256_loadu_ps(x + i)), _mm256_loadu_ps(g + i)));
     }
 #elif defined(__SSE2__)
     for (; i + 3 < n; i += 4) {
-        _mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(x + i + n)));
+        _mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
     }
 #elif defined(__ARM_NEON) && defined(__aarch64__)
     for (; i + 3 < n; i += 4) {
-        vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(x + i + n)));
+        vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
     }
 #endif
     for (; i < n; ++i) {
-        y[i] = ggml_silu_f32(x[i]) * x[i + n];
+        y[i] = ggml_silu_f32(x[i]) * g[i];
     }
 }
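After this change the function computes y[i] = silu(x[i]) * g[i], with x and g supplied as independent pointers instead of two halves of one buffer. A plain-C reference of the same math, useful for checking the SIMD branches; silu_ref and swiglu_ref are local stand-ins for ggml_silu_f32 and the vector op, not names from the commit.

#include <math.h>

static float silu_ref(float v) {
    return v / (1.0f + expf(-v));
}

// Matches the scalar tail loop of ggml_vec_swiglu_f32 above.
static void swiglu_ref(int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = silu_ref(x[i]) * g[i];
    }
}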

ggml/src/ggml-cpu/vec.h

Lines changed: 16 additions & 16 deletions
@@ -905,57 +905,57 @@ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, con
     }
 }

-inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x) {
+inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
     for (int i = 0; i < n; ++i) {
-        y[i] = (x[i] > 0.f) ? x[i] * x[i + n] : 0.f;
+        y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
     }
 }

-inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
     for (int i = 0; i < n; ++i) {
         float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v * GGML_FP16_TO_FP32(x[i + n]) : 0.f);
+        y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v * GGML_FP16_TO_FP32(g[i]) : 0.f);
     }
 }

 #ifdef GGML_GELU_FP16
-inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x) {
+inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
     uint16_t t;
     for (int i = 0; i < n; ++i) {
         if (x[i] <= -10.0f) {
             y[i] = 0.0f;
         } else if (x[i] >= 10.0f) {
-            y[i] = x[i] * x[i + n];
+            y[i] = x[i] * g[i];
         } else {
             ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
             memcpy(&t, &fp16, sizeof(uint16_t));
-            y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]) * x[i + n];
+            y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
         }
     }
 }
 #else
-inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x) {
+inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
     for (int i = 0; i < n; ++i) {
-        y[i] = ggml_gelu_f32(x[i]) * x[i + n];
+        y[i] = ggml_gelu_f32(x[i]) * g[i];
     }
 }
 #endif

-inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
     const uint16_t * i16 = (const uint16_t *) x;
     for (int i = 0; i < n; ++i) {
-        float g = GGML_FP16_TO_FP32(x[i + n]);
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * g);
+        float v = GGML_FP16_TO_FP32(g[i]);
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
     }
 }

-void ggml_vec_swiglu_f32(const int n, float * y, const float * x);
+void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);

-inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
     for (int i = 0; i < n; ++i) {
         float v = GGML_FP16_TO_FP32(x[i]);
-        float g = GGML_FP16_TO_FP32(x[i + n]);
-        y[i] = GGML_FP32_TO_FP16((v/(1.0f + expf(-v))) * g);
+        float w = GGML_FP16_TO_FP32(g[i]);
+        y[i] = GGML_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
     }
 }
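All of these helpers now share one contract: x and g are separate pointers of n elements each, so the caller decides which half of a source row plays which role. A small sketch of wiring both layouts through ggml_vec_reglu_f32, assuming the internal ggml-cpu/vec.h header is in scope; the buffer contents and names are made up for illustration.

// Assumes "vec.h" (internal CPU header) has been included.
enum { NC = 4 };

void reglu_both_layouts(float * out) {
    float row[2 * NC] = {
        1.0f, -2.0f, 3.0f, -4.0f,   // first half of the row
        0.5f,  0.5f, 0.5f,  0.5f,   // second half of the row
    };
    ggml_vec_reglu_f32(NC, out, row,      row + NC); // default: gate in second half
    ggml_vec_reglu_f32(NC, out, row + NC, row);      // swapped: gate in first half
}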

ggml/src/ggml-cuda/unary.cu

Lines changed: 22 additions & 6 deletions
@@ -199,7 +199,7 @@ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 /* gated ops */

 template <float (*op)(float), typename T>
-static __global__ void unary_gated_op_kernel(const T * x, T * dst, const int64_t k, const int64_t n, const int64_t o) {
+static __global__ void unary_gated_op_kernel(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o) {
     const int64_t i = int64_t(blockDim.x)*blockIdx.x + threadIdx.x;

     if (i >= k) {
@@ -208,13 +208,13 @@ static __global__ void unary_gated_op_kernel(const T * x, T * dst, const int64_t

     // perform base op on first half of row and multiply with gate in second half
     const int64_t j = (i / n) * o + (i % n);
-    dst[i] = (T)(op((float)x[j]) * (float)x[j + n]);
+    dst[i] = (T)(op((float)x[j]) * (float)g[j]);
 }

 template <float (*op)(float), typename T>
-static void unary_gated_cuda(const T * x, T * dst, const int64_t k, const int64_t n, const int64_t o, cudaStream_t stream) {
+static void unary_gated_cuda(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o, cudaStream_t stream) {
     const int64_t num_blocks = (k + CUDA_GLU_BLOCK_SIZE - 1) / CUDA_GLU_BLOCK_SIZE;
-    unary_gated_op_kernel<op><<<num_blocks, CUDA_GLU_BLOCK_SIZE, 0, stream>>>(x, dst, k, n, o);
+    unary_gated_op_kernel<op><<<num_blocks, CUDA_GLU_BLOCK_SIZE, 0, stream>>>(x, g, dst, k, n, o);
 }

 template <float (*op)(float)>
@@ -235,10 +235,26 @@ void ggml_cuda_op_unary_gated(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     GGML_ASSERT(dst->ne[0] == nc);
     GGML_ASSERT(ggml_nrows(dst) == ggml_nrows(src0));

+    const int32_t swapped = ((const int32_t *) dst->op_params)[1];
+
     if (src0->type == GGML_TYPE_F16) {
-        unary_gated_cuda<op>((const half *)src0_d, (half *)dst_d, ggml_nelements(dst), nc, src0->nb[1] / sizeof(half), stream);
+        unary_gated_cuda<op>(
+            (const half *)src0_d + (swapped ? nc : 0),
+            (const half *)src0_d + (swapped ? 0 : nc),
+            (half *)dst_d,
+            ggml_nelements(dst),
+            nc,
+            src0->nb[1] / sizeof(half),
+            stream);
     } else {
-        unary_gated_cuda<op>((const float *)src0_d, (float *)dst_d, ggml_nelements(dst), nc, src0->nb[1] / sizeof(float), stream);
+        unary_gated_cuda<op>(
+            (const float *)src0_d + (swapped ? nc : 0),
+            (const float *)src0_d + (swapped ? 0 : nc),
+            (float *)dst_d,
+            ggml_nelements(dst),
+            nc,
+            src0->nb[1] / sizeof(float),
+            stream);
     }
 }
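The CUDA path handles swapped entirely on the host: the two base pointers are offset before launch, and the kernel's index math is unchanged. A host-side C sketch of that mapping, with k = output elements, n = output row width (nc), o = source row stride in elements, using SwiGLU as the example op; the function name is made up, nothing here is new API.

#include <math.h>

static void gated_rows_ref(const float * src, float * dst,
                           long k, long n, long o, int swapped) {
    const float * x = src + (swapped ? n : 0);   // base-op half of each row
    const float * g = src + (swapped ? 0 : n);   // gate half of each row
    for (long i = 0; i < k; ++i) {
        long  j = (i / n) * o + (i % n);         // same row/column as dst[i]
        float v = x[j];
        dst[i]  = (v / (1.0f + expf(-v))) * g[j];
    }
}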

ggml/src/ggml.c

Lines changed: 24 additions & 4 deletions
@@ -2647,13 +2647,15 @@ struct ggml_tensor * ggml_exp_inplace(
 struct ggml_tensor * ggml_glu(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        enum ggml_glu_op      op) {
+        enum ggml_glu_op      op,
+        bool                  swapped) {
     GGML_ASSERT(ggml_is_contiguous_1(a));

     int64_t ne[GGML_MAX_DIMS] = { a->ne[0] / 2 }; for (int i = 1; i < GGML_MAX_DIMS; i++) ne[i] = a->ne[i];
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, ne, NULL, 0);

     ggml_set_op_params_i32(result, 0, (int32_t) op);
+    ggml_set_op_params_i32(result, 1, (int32_t) swapped);

     result->op     = GGML_OP_GLU;
     result->src[0] = a;
@@ -2666,23 +2668,41 @@ struct ggml_tensor * ggml_glu(
 struct ggml_tensor * ggml_reglu(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_glu(ctx, a, GGML_GLU_OP_REGLU);
+    return ggml_glu(ctx, a, GGML_GLU_OP_REGLU, false);
+}
+
+struct ggml_tensor * ggml_reglu_swapped(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_glu(ctx, a, GGML_GLU_OP_REGLU, true);
 }

 // ggml_geglu

 struct ggml_tensor * ggml_geglu(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_glu(ctx, a, GGML_GLU_OP_GEGLU);
+    return ggml_glu(ctx, a, GGML_GLU_OP_GEGLU, false);
+}
+
+struct ggml_tensor * ggml_geglu_swapped(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_glu(ctx, a, GGML_GLU_OP_GEGLU, true);
 }

 // ggml_swiglu

 struct ggml_tensor * ggml_swiglu(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_glu(ctx, a, GGML_GLU_OP_SWIGLU);
+    return ggml_glu(ctx, a, GGML_GLU_OP_SWIGLU, false);
+}
+
+struct ggml_tensor * ggml_swiglu_swapped(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_glu(ctx, a, GGML_GLU_OP_SWIGLU, true);
 }

 // ggml_norm
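The graph-building side stores both settings in op_params: slot 0 holds the ggml_glu_op, slot 1 the swapped flag, which is exactly what the CPU and CUDA backends read back above. A sketch of the reverse lookup, assuming the internal op-param helpers from ggml-impl.h are visible; the function name is illustrative.

#include <stdbool.h>

static void read_glu_params(const struct ggml_tensor * dst,
                            enum ggml_glu_op * op, bool * swapped) {
    *op      = (enum ggml_glu_op) ggml_get_op_params_i32(dst, 0);
    *swapped = ggml_get_op_params_i32(dst, 1) != 0;
}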
