-
I found that it computes RoPE with ne[2] = 32768 every time, which adds computation overhead. I modified it into the following and it works fine now:

```cpp
static struct ggml_tensor * llm_build_kqv(
         struct ggml_context * ctx,
           const llama_model & model,
         const llama_hparams & hparams,
         const llama_cparams & cparams,
        const llama_kv_cache & kv,
          struct ggml_cgraph * graph,
          struct ggml_tensor * wo,
          struct ggml_tensor * wo_b,
          struct ggml_tensor * q_cur,
          struct ggml_tensor * kq_mask,
                     int32_t   n_tokens,
                     int32_t   n_kv,
                       float   kq_scale,
          const llm_build_cb & cb,
                         int   il) {
    const int64_t n_ctx         = cparams.n_ctx;
    const int64_t n_head        = hparams.n_head;
    const int64_t n_head_kv     = hparams.n_head_kv;
    const int64_t n_embd_head_k = hparams.n_embd_head_k;
    const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
    const int64_t n_embd_head_v = hparams.n_embd_head_v;
    const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();

    struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
    cb(q, "q", il);

    struct ggml_tensor * k;
    if (cparams.pre_rope_cache && (kv.type_k == GGML_TYPE_F32 || kv.type_k == GGML_TYPE_F16)) {
        // view only the n_kv cache entries in use, shaped [n_embd_head_k, n_head_kv, n_kv]
        // so the position index runs along dim 2 (previously ne[2] was the full cache size)
        k = ggml_view_3d(ctx, kv.k_l[il],
                n_embd_head_k, n_head_kv, n_kv,
                ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
                ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
                0);

        // apply RoPE to the stored pre-rope keys on the fly
        k = ggml_rope_ext(
                ctx, k, nullptr, nullptr,
                hparams.n_rot, hparams.rope_type, 0, cparams.n_yarn_orig_ctx,
                cparams.rope_freq_base, cparams.rope_freq_scale,
                cparams.yarn_ext_factor, cparams.yarn_attn_factor,
                cparams.yarn_beta_fast, cparams.yarn_beta_slow);

        // k->ne[2] = kv.size;

        // re-view the roped keys as [n_embd_head_k, n_kv, n_head_kv] to match the layout
        // produced by the standard path below
        k = ggml_view_3d(ctx, k,
                n_embd_head_k, n_kv, n_head_kv,
                ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
                ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
                0);
    } else {
        // keys were already roped before being stored: use the usual cache view
        k = ggml_view_3d(ctx, kv.k_l[il],
                n_embd_head_k, n_kv, n_head_kv,
                ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
                ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
                0);
    }
    cb(k, "k", il);

    // skip below
}
```

I don't know why it combines
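(Note: `pre_rope_cache` is not an upstream `llama_cparams` field; the snippet above assumes it was added as a plain flag next to the other context parameters, roughly like this:)

```cpp
// Hypothetical addition (not in upstream llama.cpp): flag marking that the K cache
// stores pre-RoPE keys, so RoPE has to be applied at attention time in llm_build_kqv.
struct llama_cparams {
    // ... existing fields (n_ctx, rope_freq_base, yarn_*, ...) ...
    bool pre_rope_cache = false;
};
```

With the original view the rope node saw ne[2] = 32768 (the whole cache) on every decode step. For example, with n_head_kv = 8 and n_embd_head_k = 128 (llama3-8B) that is roughly 32768 × 8 × 128 ≈ 33.5M elements roped per layer per token even when only n_kv slots are in use, which is what the n_kv-sized view avoids.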
-
Hi~ I'm just trying to store a pre-RoPE key cache in order to make quantization better. So I skip the `ggml_rope_ext` call before the keys are stored, roughly as in the sketch below, and then modify a few things inside `llm_build_kqv` (model: llama3). I also modified the rope GGML_OP so that it re-computes RoPE for all the KV-cache entries in use, and I have checked that this works.
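(A minimal sketch of that store-side change, assuming a `cparams.pre_rope_cache` flag simply guards the existing `ggml_rope_ext` call on `Kcur` before the keys are written into the cache; variable names mirror the build-graph code rather than the exact upstream lines:)

```cpp
// Hypothetical sketch: skip RoPE on the keys that get written into the K cache.
// Kcur, inp_pos, ctx0, cb, il and the hparams/cparams fields are the usual
// build-graph names; pre_rope_cache is the assumed new flag.
struct ggml_tensor * Kcur_store = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens);
if (!cparams.pre_rope_cache) {
    // default path: rotate the keys before they are stored
    Kcur_store = ggml_rope_ext(
            ctx0, Kcur_store, inp_pos, nullptr,
            hparams.n_rot, hparams.rope_type, 0, cparams.n_yarn_orig_ctx,
            cparams.rope_freq_base, cparams.rope_freq_scale,
            cparams.yarn_ext_factor, cparams.yarn_attn_factor,
            cparams.yarn_beta_fast, cparams.yarn_beta_slow);
}
cb(Kcur_store, "Kcur", il);
// Kcur_store then flows into the KV-store step, so the cache holds raw (pre-RoPE) keys.
```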
Unfortunately, when I try a large context length (e.g. 32768) the performance declines dramatically (from 11 tokens/s to 1 token/s), and that is measured on the first few generated tokens.

The change is in `ggml_compute_forward_rope_f32` (see the sketch at the end of this comment): I only make it take the array index as the position when computing RoPE for the key cache (`inp_pos` is passed as `nullptr`). The bottleneck is `ggml_rope_ext`, and if the context length is set smaller (e.g. 512) the performance does not decline nearly as much (by context length I mean the value set with `-c`). Re-computing RoPE for each cached token should slow things down somewhat, but it is already really slow while generating just the first few tokens, and I would expect those first tokens not to be impacted much since they only add a little extra computation initially.
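(The body of `ggml_compute_forward_rope_f32` is omitted here; a minimal sketch of the described change, assuming the upstream loop structure where `i2` walks the position dimension, is:)

```cpp
// Hypothetical sketch of the change inside ggml_compute_forward_rope_f32:
// when no positions tensor (inp_pos / src1) is given, fall back to the row index i2,
// so the key stored in slot i2 is rotated by the angle for position i2.
const int32_t * pos = src1 ? (const int32_t *) src1->data : nullptr;

for (int64_t i3 = 0; i3 < ne3; i3++) {
    for (int64_t i2 = 0; i2 < ne2; i2++) {
        const int64_t p = pos ? pos[i2] : i2;   // position-from-index fallback
        // ... existing per-head / per-dimension rotation using p ...
    }
}
```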