@@ -3245,13 +3245,15 @@ static bool llama_kv_cache_init(
         cache.ctxs.push_back(ctx);
     }
 
-    cache.k_l.reserve(n_layer);
-    cache.v_l.reserve(n_layer);
-
-    // DeepSeek MLA
-    cache.kv_l.reserve(n_layer);
-    if (cparams.mla_attn == 1 && !cparams.flash_attn) {
-        cache.kvt_l.reserve(n_layer);
+    if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn) {
+        // DeepSeek MLA
+        cache.kv_l.reserve(n_layer);
+        if (cparams.mla_attn == 1 && !cparams.flash_attn) {
+            cache.kvt_l.reserve(n_layer);
+        }
+    } else {
+        cache.k_l.reserve(n_layer);
+        cache.v_l.reserve(n_layer);
     }
 
     bool warn = true;
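For illustration, a minimal standalone C++ sketch of the reservation gate this hunk introduces: MLA-enabled DeepSeek2 models reserve only the latent caches (plus the transposed cache when mla_attn == 1 without flash attention), everything else keeps the standard per-layer K/V layout. The enum, struct fields, and int vectors below are simplified stand-ins for the real llama.cpp types, not the actual implementation.

// hypothetical stand-ins for the llama.cpp types; element type simplified to int
#include <cstdio>
#include <vector>

enum llm_arch { LLM_ARCH_LLAMA, LLM_ARCH_DEEPSEEK2 };

struct cache_sketch {
    std::vector<int> k_l, v_l;  // regular per-layer K/V caches
    std::vector<int> kv_l;      // MLA latent cache
    std::vector<int> kvt_l;     // transposed latent cache (mla_attn == 1, no FA)
};

static void reserve_kv(cache_sketch & cache, llm_arch arch,
                       int mla_attn, bool flash_attn, int n_layer) {
    if (arch == LLM_ARCH_DEEPSEEK2 && mla_attn) {
        // MLA path: only the latent cache(s) are needed
        cache.kv_l.reserve(n_layer);
        if (mla_attn == 1 && !flash_attn) {
            cache.kvt_l.reserve(n_layer);
        }
    } else {
        // all other architectures keep the standard K/V layout
        cache.k_l.reserve(n_layer);
        cache.v_l.reserve(n_layer);
    }
}

int main() {
    cache_sketch c;
    reserve_kv(c, LLM_ARCH_DEEPSEEK2, /*mla_attn=*/1, /*flash_attn=*/false, 61);
    // prints: kv_l cap=61 kvt_l cap=61 k_l cap=0
    std::printf("kv_l cap=%zu kvt_l cap=%zu k_l cap=%zu\n",
                c.kv_l.capacity(), c.kvt_l.capacity(), c.k_l.capacity());
}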
@@ -3299,7 +3301,7 @@ static bool llama_kv_cache_init(
             cache.v_l.push_back(v);
         }
     }
-    if (cparams.mla_attn && n_mla < n_layer && n_mla > 0) {
+    if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn && n_mla < n_layer && n_mla > 0) {
         LLAMA_LOG_ERROR("%s: unexpected situation with %d out of %d layers having MLA enabled\n", __func__, n_mla, int(n_layer));
         LLAMA_LOG_ERROR("%s: bailing out\n", __func__);
         GGML_ABORT("fatal error");
@@ -18568,6 +18570,13 @@ struct llama_context * llama_new_context_with_model(
         params.seed = time(NULL);
     }
 
+    if (model->arch != LLM_ARCH_DEEPSEEK2 && cparams.mla_attn > 0) {
+        LLAMA_LOG_WARN("=====================================================================\n");
+        LLAMA_LOG_WARN(" MLA is only available for LLM_ARCH_DEEPSEEK2 -> turning off MLA\n");
+        LLAMA_LOG_WARN("=====================================================================\n");
+        cparams.mla_attn = 0;
+    }
+
     LLAMA_LOG_INFO("%s: n_ctx      = %u\n", __func__, cparams.n_ctx);
     LLAMA_LOG_INFO("%s: n_batch    = %u\n", __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch   = %u\n", __func__, cparams.n_ubatch);
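And a hypothetical sketch of the effect of this guard, again with stand-in types rather than the real llama.cpp API: any non-DeepSeek2 model has the requested mla_attn mode forced back to 0 before the context is configured, while DeepSeek2 keeps it.

#include <cstdio>

enum llm_arch { LLM_ARCH_LLAMA, LLM_ARCH_DEEPSEEK2 };

// mirrors the guard above: warn and disable MLA for unsupported architectures
static int sanitize_mla(llm_arch arch, int mla_attn) {
    if (arch != LLM_ARCH_DEEPSEEK2 && mla_attn > 0) {
        std::fprintf(stderr, "MLA is only available for LLM_ARCH_DEEPSEEK2 -> turning off MLA\n");
        return 0;
    }
    return mla_attn;
}

int main() {
    std::printf("%d\n", sanitize_mla(LLM_ARCH_LLAMA,     2)); // 0: MLA disabled
    std::printf("%d\n", sanitize_mla(LLM_ARCH_DEEPSEEK2, 2)); // 2: mode kept
}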