DeepSeek V2/V3 MLA implementation #12801
Changes from 5 commits
@@ -10,6 +10,7 @@
 #include <cstring>
 #include <stdexcept>
 #include <cinttypes>
+#include <math.h>

 //
 // llama_context
@@ -473,7 +474,6 @@ ggml_tensor * llama_context::build_rope_shift( | |
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; | ||
|
||
const auto & yarn_ext_factor = cparams.yarn_ext_factor; | ||
const auto & yarn_attn_factor = cparams.yarn_attn_factor; | ||
const auto & yarn_beta_fast = cparams.yarn_beta_fast; | ||
const auto & yarn_beta_slow = cparams.yarn_beta_slow; | ||
|
||
|
@@ -482,6 +482,10 @@ ggml_tensor * llama_context::build_rope_shift(
     const auto & n_rot     = hparams.n_rot;
     const auto & rope_type = hparams.rope_type;

+    // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
+    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
+    const float yarn_attn_factor_scaled = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
+
Comment on lines +485 to +488: Can this be absorbed in the …
     ggml_tensor * tmp;

     if (ggml_is_quantized(cur->type)) {
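For context on the `yarn_attn_factor_scaled` correction above: ggml's YaRN RoPE path applies its own magnitude scaling on top of `attn_factor`, while `llm_build_deepseek2()` already folds that scaling into the attention scale, so the K-shift path passes the reciprocal to avoid applying it twice. A minimal standalone sketch (not part of this diff; the `freq_scale` value is illustrative and the built-in `mscale` formula is an assumption about ggml's YaRN implementation):

```cpp
// Sketch only: shows why 1/(1 + 0.1*ln(1/freq_scale)) cancels the YaRN
// magnitude scaling that ggml is assumed to apply internally.
#include <cmath>
#include <cstdio>

int main() {
    const float freq_scale  = 0.025f; // illustrative YaRN context-extension factor
    const float attn_factor = 1.0f;   // default attn_factor

    // Assumed built-in scaling applied by ggml's YaRN rope:
    //   mscale = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale))
    const float mscale_default = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));

    // DeepSeek2 passes the reciprocal so the net scaling becomes 1.0,
    // because the model accounts for mscale in its own attention scale:
    const float yarn_attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
    const float mscale_deepseek2 = yarn_attn_factor_scaled * (1.0f + 0.1f * logf(1.0f / freq_scale));

    printf("mscale with default attn_factor: %f\n", mscale_default);
    printf("mscale with scaled attn_factor:  %f\n", mscale_deepseek2); // ~1.0
    return 0;
}
```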
@@ -500,14 +504,14 @@
         tmp = ggml_rope_ext_inplace(ctx0, tmp,
                 shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
+                yarn_ext_factor, yarn_attn_factor_scaled, yarn_beta_fast, yarn_beta_slow);

         tmp = ggml_cpy(ctx0, tmp, cur);
     } else {
         // we rotate only the first n_rot dimensions
         tmp = ggml_rope_ext_inplace(ctx0, cur,
                 shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
+                yarn_ext_factor, yarn_attn_factor_scaled, yarn_beta_fast, yarn_beta_slow);
     }

     return tmp;
@@ -2274,6 +2278,11 @@ llama_context * llama_init_from_model(
         params.flash_attn = false;
     }

+    if (params.flash_attn && model->arch == LLM_ARCH_DEEPSEEK2) {
+        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Deepseek2 - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
Comment on lines +2281 to +2285: There is a way to enable the FA path to be compatible with MLA. I will fix this in a follow-up PR: https://github.com/ggml-org/llama.cpp/tree/gg/mla. For now, keep it like this.
     if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
         LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
         return nullptr;
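To see the new guard from the caller's side, here is a hedged usage sketch (not from this PR) that requests flash attention for a DeepSeek2 model through the public `llama.h` API of this period; with the change above, context creation logs the warning and falls back to `flash_attn = false`. The model path is a placeholder passed on the command line.

```cpp
// Usage sketch, assuming the llama.h API contemporary with this PR.
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <deepseek2-model.gguf>\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file(argv[1], mparams);
    if (model == nullptr) {
        llama_backend_free();
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true; // forced off with a warning for LLM_ARCH_DEEPSEEK2

    llama_context * ctx = llama_init_from_model(model, cparams);
    // ... run inference ...

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```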