Commit 9770efa

context : allow cache-less context for embeddings
ggml-ci
1 parent 1d36b36

File tree

3 files changed: +15 -4 lines changed

examples/embedding/embedding.cpp

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         }
     } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
         // decoder-only model
-        if (llama_decode(ctx, batch) < 0) {
+        if (llama_encode(ctx, batch) < 0) {
             LOG_ERR("%s : failed to decode\n", __func__);
         }
     }
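
Note: once the batch has gone through llama_encode, the pooled result is read back through the usual embedding getters, as the surrounding example code does. A minimal sketch of that retrieval, assuming a pooling type other than NONE; the helper name and logging are illustrative, not part of this commit, and the getter names follow recent llama.h:

    #include "llama.h"
    #include <cstdio>

    // sketch: read back the pooled embedding for one sequence after llama_encode
    static const float * get_pooled_embd(llama_context * ctx, llama_seq_id seq_id) {
        const int n_embd = llama_model_n_embd(llama_get_model(ctx));

        // with pooling enabled, this is a vector of n_embd floats for the sequence
        const float * embd = llama_get_embeddings_seq(ctx, seq_id);
        if (embd == nullptr) {
            fprintf(stderr, "no embedding for sequence %d (n_embd = %d)\n", seq_id, n_embd);
        }
        return embd;
    }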

src/llama-context.cpp

Lines changed: 8 additions & 3 deletions
@@ -253,7 +253,7 @@ llama_context::llama_context(
     }
 
     // reserve worst-case graph
-    if (!hparams.vocab_only) {
+    if (!hparams.vocab_only && memory) {
         const uint32_t n_seqs = 1; // TODO: worst-case number of sequences
         const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
@@ -763,12 +763,12 @@ int llama_context::encode(llama_batch & inp_batch) {
     ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
     GGML_ASSERT(backend_embd != nullptr);
 
-    GGML_ASSERT(embd != nullptr);
-
     switch (cparams.pooling_type) {
         case LLAMA_POOLING_TYPE_NONE:
             {
                 // extract token embeddings
+                GGML_ASSERT(embd != nullptr);
+
                 GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size);
                 ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float));
             } break;
@@ -840,6 +840,11 @@ int llama_context::decode(llama_batch & inp_batch) {
         return -1;
     }
 
+    if (!memory) {
+        LLAMA_LOG_WARN("%s: cannot decode batches with this context\n", __func__);
+        return -1;
+    }
+
     llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
 
     // temporary allocate memory for the input batch if needed
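
The new check at the top of llama_context::decode means a context whose memory module is empty (no KV cache was created) refuses to decode and returns -1, so such batches have to go through llama_encode instead. A small caller-side sketch of that behaviour, assuming llama.h is included and the usual return convention (0 on success, negative on failure); the helper is illustrative, not part of this commit:

    #include "llama.h"

    // sketch: process a batch on a context that may or may not have a KV cache
    static int process_batch(llama_context * ctx, llama_batch & batch) {
        const int ret = llama_decode(ctx, batch);
        if (ret == 0) {
            return 0;
        }
        // if decode is rejected (e.g. the context was created without a KV cache),
        // fall back to llama_encode, which does not need one
        return llama_encode(ctx, batch);
    }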

src/llama-model.cpp

Lines changed: 6 additions & 0 deletions
@@ -12833,6 +12833,12 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
     llama_memory_i * res;
 
     switch (arch) {
+        case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_NOMIC_BERT:
+            {
+                res = nullptr;
+            } break;
         case LLM_ARCH_MAMBA:
         case LLM_ARCH_RWKV6:
         case LLM_ARCH_RWKV6QWEN2:
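
Returning nullptr from create_memory for the BERT-family architectures is what makes their contexts cache-less: the context's memory member stays empty, which skips the worst-case graph reservation and trips the new decode guard above. A sketch of creating such an embeddings-only context from the public API, assuming an already loaded model handle; the parameter values are illustrative and the init function name follows recent llama.h:

    #include "llama.h"

    // sketch: embeddings-only context for a BERT-family model; after this commit
    // no KV cache is allocated for these architectures
    static llama_context * make_embd_ctx(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.embeddings   = true;                    // return embeddings instead of logits
        cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN; // pooled per-sequence embeddings

        return llama_init_from_model(model, cparams);
    }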
