
Commit b4ab8e4

WIP: Very temporary attempt at a version of the SSM scan kernel that parallelizes over d_state
This is extremely hacky! It also doesn't have any performance benefits yet.

Branch: GraniteFourPerf
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
1 parent ad969d7 commit b4ab8e4

2 files changed: +305 -2 lines changed

ggml/src/ggml-metal/ggml-metal.m

Lines changed: 5 additions & 2 deletions
@@ -1211,7 +1211,8 @@ @implementation GGMLMetalClass
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_CONV_F32, ssm_conv_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32, ssm_scan_f32, true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32_GROUP, ssm_scan_f32_group, true);
+        // GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32_GROUP, ssm_scan_f32_group, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32_GROUP, ssm_scan_f32_group_GHART, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32, rwkv_wkv6_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32, rwkv_wkv7_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, has_simdgroup_reduction);

@@ -2986,6 +2987,7 @@ static bool ggml_metal_encode_node(
             /*.n_group      =*/ n_group,
             /*.n_seq_tokens =*/ n_seq_tokens,
             /*.n_seqs       =*/ n_seqs,
+            /*.s_off        =*/ ggml_nelements(src1) * sizeof(float),
             /*.nb01         =*/ nb01,
             /*.nb02         =*/ nb02,
             /*.nb03         =*/ nb03,

@@ -3016,7 +3018,8 @@ static bool ggml_metal_encode_node(
 
                 if (ne30 == 1) {
                     // Mamba-2
-                    [encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_head, n_seqs) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                    [encoder setThreadgroupMemoryLength:d_state*sizeof(float) atIndex:0];
+                    [encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_head, n_seqs) threadsPerThreadgroup:MTLSizeMake(d_state, 1, 1)];
                 } else {
                     GGML_ASSERT(d_inner == 1);
                     [encoder dispatchThreadgroups:MTLSizeMake(n_head, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
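For orientation: the Mamba-2 path now launches one threadgroup per (d_inner, n_head, n_seqs) work item with d_state threads each, one per state column, and reserves d_state floats of threadgroup memory for the per-column products. The new s_off field tells the kernel where the second output lives: dst holds the y tensor first, then the updated SSM states. A minimal sketch of that layout, assuming float storage (the helper name is illustrative, not part of the commit):

#include <metal_stdlib>
using namespace metal;

// Sketch only: dst layout assumed by the GHART kernel below.
//   [ y : d_inner * n_head * n_seq_tokens * n_seqs floats ]
//   [ s : updated SSM states, starting at byte offset s_off ]
// The host sets s_off = ggml_nelements(src1) * sizeof(float), i.e. the byte size of y.
static device float * ssm_scan_state_base(device float * dst, uint64_t s_off) {
    // the updated states live past the end of y in the same buffer
    return (device float *) ((device char *) dst + s_off);
}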

ggml/src/ggml-metal/ggml-metal.metal

Lines changed: 300 additions & 0 deletions
@@ -1751,6 +1751,306 @@ kernel void kernel_ssm_scan_f32(
     }
 }
 
+// ref: ggml.c:ggml_compute_forward_ssm_scan_f32, Mamba-2 part
+// WIP--- ghart
+kernel void kernel_ssm_scan_f32_group_GHART(
+        device const void * src0,
+        device const void * src1,
+        device const void * src2,
+        device const void * src3,
+        device const void * src4,
+        device const void * src5,
+        device const void * src6,
+        device      float * dst,
+        threadgroup float * shared [[threadgroup(0)]],
+        constant ggml_metal_kargs_ssm_scan & args,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        uint3  tpitg[[thread_position_in_threadgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        uint3    ntg[[threads_per_threadgroup]]) {
+
+    const int64_t i1 = tgpig.x;
+    const int64_t ir = tgpig.y; // current head
+    const int64_t i3 = tgpig.z; // current seq
+
+    const uint64_t nb00 = sizeof(float);
+    const uint64_t nb10 = sizeof(float);
+    const uint64_t nb20 = sizeof(float);
+
+    const int64_t nc  = args.d_state;
+    const int64_t nr  = args.d_inner;
+    const int64_t nh  = args.n_head;
+    const int64_t ng  = args.n_group;
+    const int64_t n_t = args.n_seq_tokens;
+
+    const int64_t s_off = args.s_off;
+
+    device const int32_t * ids = (device const int32_t *) src6;
+
+    device const float * s0 = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03);
+    device       float * s  = (device       float *) ((device       char *) dst  + ir*args.nb02 + i3*args.nb03 + s_off);
+
+    for (int64_t i2 = 0; i2 < n_t; ++i2) {
+        device const float * x  = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i2*args.nb12 + i3*args.nb13); // {dim, nh, nt, ns}
+        device const float * dt = (device const float *) ((device const char *) src2 + ir*nb20 + i2*args.nb21 + i3*args.nb22); // {nh, nt, ns}
+        device const float * A  = (device const float *) ((device const char *) src3 + ir*args.nb31); // {1, nh}
+        device const float * B  = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i2*args.nb42 + i3*args.nb43); // {d_state, ng, nt, ns}
+        device const float * C  = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i2*args.nb52 + i3*args.nb53); // {d_state, ng, nt, ns}
+        device       float * y  = (device       float *) ((device       char *) dst  + (i1 + ir*(nr) + i2*(nh*nr) + i3*(n_t*nh*nr))*nb00); // {dim, nh, nt, ns}
+
+        const float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0];
+        const float x_dt = x[0] * dt_soft_plus;
+        const float dA = exp(dt_soft_plus * A[0]);
+
+        /*
+
+        if (sgitg == 0) {
+            shared[tiisg] = 0.0f;
+        }
+
+        float sumf = 0;
+
+        for (int64_t i0 = tpitg.x; i0 < nc; i0 += ntg.x) {
+            const int64_t i = i0 + i1*nc;
+            const float state = (s0[i] * dA) + (B[i0] * x_dt);
+            sumf += state * C[i0];
+            s[i] = state;
+        }
+
+        sumf = simd_sum(sumf);
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (sgitg == 0) {
+            shared[sgitg] = sumf;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        sumf = shared[tiisg];
+        sumf = simd_sum(sumf);
+
+        if (tpitg.x == 0) {
+            y[0] = sumf;
+        }
+
+        /*/
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        // if (sgitg == 0) {
+        //     shared[tiisg] = 0.0f;
+        // }
+
+        // float sumf = 0;
+
+        // Assuming num threads == d_state
+        // for (int64_t i0 = 0; i0 < nc; ++i0) {
+        const int64_t i = tpitg.x + i1*nc;
+        const float state = (s0[i] * dA) + (B[tpitg.x] * x_dt);
+        shared[tpitg.x] = state * C[tpitg.x];
+        // sumf += state * C[tpitg.x];
+        s[i] = state;
+        // }
+
+        // sumf = simd_sum(sumf);
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        // sumf = shared[tiisg];
+        // sumf = simd_sum(sumf);
+
+        // GG: vvv this sum is a big bottleneck!
+
+        float sumf = 0.0f;
+        for (int64_t i0 = 0; i0 < nc; ++i0) {
+            sumf += shared[i0];
+        }
+
+        y[0] = sumf;
+
+        //*/
+
+        // recurse
+        s0 = s;
+    }
+
+    //----------------------------------
+
+    // //DEBUG
+    // const int64_t splitH = 16;
+    // const int64_t d_state = 128;
+    // // const int64_t d_state = args.d_state;
+    // const int64_t WARP_SIZE = ntg.x;
+
+    // const int64_t d_head  = args.d_inner;
+    // const int64_t n_head  = args.n_head;
+    // const int64_t n_group = args.n_group;
+    // const int64_t n_tok   = args.n_seq_tokens;
+
+    // const int64_t head_idx = (tgpig.x * splitH) / d_head;
+    // const int64_t head_off = ((tgpig.x * splitH) % d_head) * sizeof(float);
+    // const int64_t seq_idx  = tgpig.y;
+
+    // const int64_t group_off = (head_idx & (n_group - 1)) * d_state * sizeof(float);
+
+    // device const int32_t * ids = (device const int32_t *) src6;
+
+    // device const float * s0_block = (device const float *) ((device const char *) src0 + ids[seq_idx] * args.nb03 + head_idx * args.nb02 + head_off * d_state);
+    // device const float * x_block  = (device const float *) ((device const char *) src1 + (seq_idx * args.nb13) + tgpig.x * splitH * sizeof(float));
+    // device const float * dt_block = (device const float *) ((device const char *) src2 + (seq_idx * args.nb22) + head_idx * sizeof(float));
+    // device const float * A_block  = (device const float *) ((device const char *) src3 + head_idx * args.nb31);
+    // device const float * B_block  = (device const float *) ((device const char *) src4 + (seq_idx * args.nb43) + (group_off));
+    // device const float * C_block  = (device const float *) ((device const char *) src5 + (seq_idx * args.nb53) + (group_off));
+    // device float * y_block = dst + (seq_idx * n_tok * n_head * d_head) + tgpig.x * splitH;
+    // device float * s_block = (device float *) ((device char *) dst + args.s_off + seq_idx * args.nb03 + head_idx * args.nb02 + head_off * d_state);
+
+    // // strides across n_seq_tokens
+    // const int stride_x  = args.nb12 / sizeof(float);
+    // const int stride_dt = args.nb21 / sizeof(float);
+    // const int stride_B  = args.nb42 / sizeof(float);
+    // const int stride_C  = args.nb52 / sizeof(float);
+    // const int stride_y  = n_head * d_head;
+
+    // float state[splitH];
+    // // for the parallel accumulation
+
+    // //DEBUG -- TODO! No parallelism on accumulation
+    // float stateC[splitH * d_state];
+
+    //----------------------------------
+
+    // // #pragma unroll
+    // for (int j = 0; j < splitH; j++) {
+    //     state[j] = s0_block[j * d_state + tpitg.x];
+    // }
+
+    // for (int64_t i = 0; i < n_tok; i++) {
+    //     // TODO: only calculate dA and dt_soft_plus once per head instead of every splitH head elements
+    //     // TODO: only calculate B and C once per head group
+    //     // NOTE: dt_soft_plus, dA and x_dt have the same value across threads here.
+    //     float dt_soft_plus = dt_block[i * stride_dt];
+    //     if (dt_soft_plus <= 20.0f) {
+    //         dt_soft_plus = log(1.0f + exp(dt_soft_plus));
+    //     }
+    //     const float dA = exp(dt_soft_plus * A_block[0]);
+    //     const float B = B_block[i * stride_B + tpitg.x];
+    //     const float C = C_block[i * stride_C + tpitg.x];
+
+    //     // across d_head
+    //     // #pragma unroll
+    //     for (int j = 0; j < splitH; j++) {
+    //         const float x_dt = x_block[i * stride_x + j] * dt_soft_plus;
+
+    //         state[j] = (state[j] * dA) + (B * x_dt);
+
+    //         stateC[j * d_state + tpitg.x] = state[j] * C;
+    //     }
+
+    //     //DEBUG
+    //     // __syncthreads();
+
+    //     // parallel accumulation for stateC
+    //     // TODO: simplify
+    //     {
+    //         static_assert((d_state & -d_state) == d_state, "the state size has to be a power of 2");
+    //         static_assert((splitH & -splitH) == splitH, "splitH has to be a power of 2");
+
+    //         // reduce until w matches the warp size
+    //         // TODO: does this work even when the physical warp size is 64?
+    //         // #pragma unroll
+    //         for (int w = d_state; w > WARP_SIZE; w >>= 1) {
+    //             // (assuming there are d_state threads)
+    //             // #pragma unroll
+    //             for (int j = 0; j < ((w >> 1) * splitH + d_state - 1) / d_state; j++) {
+    //                 // TODO: check for bank conflicts
+    //                 const int k = (tpitg.x % (w >> 1)) + (d_state * (tpitg.x / (w >> 1))) + j * d_state * (d_state / (w >> 1));
+    //                 stateC[k] += stateC[k + (w >> 1)];
+
+    //             }
+    //             //DEBUG
+    //             // __syncthreads();
+    //         }
+
+    //         // static_assert(splitH >= d_state / WARP_SIZE);
+
+    //         // #pragma unroll
+    //         for (int j = 0; j < splitH / (d_state / WARP_SIZE); j++) {
+    //             float y = stateC[(tpitg.x % WARP_SIZE) + d_state * (tpitg.x / WARP_SIZE) + j * d_state * (d_state / WARP_SIZE)];
+    //             //DEBUG
+    //             // y = warp_reduce_sum(y);
+
+    //             // store the above accumulations
+    //             if (tpitg.x % WARP_SIZE == 0) {
+    //                 const int k = tpitg.x / WARP_SIZE + j * (d_state / WARP_SIZE);
+    //                 y_block[i * stride_y + k] = y;
+    //             }
+    //         }
+    //     }
+    // }
+
+    // // write back the state
+    // // #pragma unroll
+    // for (int j = 0; j < splitH; j++) {
+    //     s_block[j * d_state + tpitg.x] = state[j];
+    // }
+
+    // const int64_t i1 = tgpig.x;
+    // const int64_t ir = tgpig.y; // current head
+    // const int64_t i3 = tgpig.z; // current seq
+
+    // const uint64_t nb00 = sizeof(float);
+    // const uint64_t nb10 = sizeof(float);
+    // const uint64_t nb20 = sizeof(float);
+
+    // const int64_t nc  = args.d_state;
+    // const int64_t nr  = args.d_inner;
+    // const int64_t nh  = args.n_head;
+    // const int64_t ng  = args.n_group;
+    // const int64_t n_t = args.n_seq_tokens;
+
+    // const int64_t s_off = nr * nh * n_t * args.n_seqs * sizeof(float);
+
+    // device const int32_t * ids = (device const int32_t *) src6;
+
+    // device const float * s0 = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03);
+    // device float * s = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off);
+
+    // for (int64_t i2 = 0; i2 < n_t; ++i2) {
+    //     device const float * x = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i2*args.nb12 + i3*args.nb13); // {dim, nh, nt, ns}
+    //     device const float * dt = (device const float *) ((device const char *) src2 + ir*nb20 + i2*args.nb21 + i3*args.nb22); // {nh, nt, ns}
+    //     device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); // {1, nh}
+    //     device const float * B = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i2*args.nb42 + i3*args.nb43); // {d_state, ng, nt, ns}
+    //     device const float * C = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i2*args.nb52 + i3*args.nb53); // {d_state, ng, nt, ns}
+    //     device float * y = (device float *) ((device char *) dst + (i1 + ir*(nr) + i2*(nh*nr) + i3*(n_t*nh*nr))*nb00); // {dim, nh, nt, ns}
+
+    //     const float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0];
+    //     const float x_dt = x[0] * dt_soft_plus;
+    //     const float dA = exp(dt_soft_plus * A[0]);
+    //     float sumf = 0.0f;
+
+    //     for (int64_t i0 = 0; i0 < nc; ++i0) {
+    //         const int64_t i = i0 + i1*nc;
+    //         const float state = (s0[i] * dA) + (B[i0] * x_dt);
+    //         sumf += state * C[i0];
+    //         s[i] = state;
+    //     }
+
+    //     y[0] = sumf;
+
+    //     // recurse
+    //     s0 = s;
+    // }
+}
+
 // ref: ggml.c:ggml_compute_forward_ssm_scan_f32, Mamba-2 part
 // TODO: optimize (e.g. by parallelizing over d_state)
 kernel void kernel_ssm_scan_f32_group(
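The kernel's own comment ("GG: vvv this sum is a big bottleneck!") marks the serial loop over shared[], which is exactly what the disabled /* ... */ branch was trying to avoid with simd_sum. A minimal sketch of the two-level reduction that branch gestures at, assuming one thread per state column and that the threadgroup scratch has room for one float per simdgroup (the helper name tg_sum is illustrative, not part of the commit):

#include <metal_stdlib>
using namespace metal;

// Sketch only: replace the serial sum over shared[] with a
// simdgroup reduction followed by a reduction of the partials.
static float tg_sum(float v,                      // this thread's state*C product
                    threadgroup float * partials, // >= one float per simdgroup
                    ushort sgitg,                 // simdgroup index in threadgroup
                    ushort tiisg,                 // thread index in simdgroup
                    ushort n_sg) {                // simdgroups per threadgroup
    v = simd_sum(v);                              // reduce within each simdgroup
    if (tiisg == 0) {
        partials[sgitg] = v;                      // one partial per simdgroup
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    if (sgitg == 0) {
        v = tiisg < n_sg ? partials[tiisg] : 0.0f;
        v = simd_sum(v);                          // combine the partials
    }
    return v;                                     // valid in simdgroup 0
}

With d_state = 128 and a SIMD width of 32 this combines four partials instead of looping 128 times per token, with thread 0 then writing y[0].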
