find result_norm/result_embd tensors properly; update output allocation logic

iamlemec · iamlemec · commit 1756c4b5b69d · 2024-06-14T12:31:18.000-06:00
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
@@ -31,8 +31,8 @@ static bool needs_logit(enum llama_pooling_type pooling_type, int pos, int n_tok
     }
 }
 
-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id, enum llama_pooling_type pooling_type) {
-    int n_tokens = tokens.size();
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id, enum llama_pooling_type pooling_type) {
+    size_t n_tokens = tokens.size();
     for (size_t i = 0; i < n_tokens; i++) {
         bool logit = needs_logit(pooling_type, i, n_tokens);
         llama_batch_add(batch, tokens[i], i, { seq_id }, logit);
diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp
@@ -87,9 +87,9 @@ static bool needs_logit(enum llama_pooling_type pooling_type, int pos, int n_tok
     }
 }
 
-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id, enum llama_pooling_type pooling_type) {
-    int n_tokens = tokens.size();
-    for (size_t i = 0; i < tokens.size(); i++) {
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id, enum llama_pooling_type pooling_type) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
         bool logit = needs_logit(pooling_type, i, n_tokens);
         llama_batch_add(batch, tokens[i], i, { seq_id }, logit);
     }
diff --git a/llama.cpp b/llama.cpp
@@ -7436,11 +7436,17 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
-        struct ggml_tensor * inp = gf->nodes[gf->n_nodes - 1];
-        if (strcmp(inp->name, "result_embd") != 0) {
-            inp = gf->nodes[gf->n_nodes - 2];
-            GGML_ASSERT(strcmp(inp->name, "result_norm") == 0 && "embeddings tensor not found");
+        // find the last result_norm/result_embd tensor in the graph to use as input
+        struct ggml_tensor * inp = nullptr;
+        for (int i = gf->n_nodes - 1; i >= 0; --i) {
+            inp = gf->nodes[i];
+            if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
+                break;
+            } else {
+                inp = nullptr;
+            }
         }
+        GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
 
         struct ggml_tensor * cur;
 
@@ -12029,8 +12035,8 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     const auto n_embd  = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
-    const bool has_logits = cparams.causal_attn;
-    const bool has_embd   = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+    const bool has_logits = !cparams.embeddings;
+    const bool has_embd   =  cparams.embeddings;
 
     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
     const size_t embd_size   = has_embd   ?  n_embd*n_outputs_max : 0;

Original file line number	Diff line number	Diff line change
`@@ -31,8 +31,8 @@ static bool needs_logit(enum llama_pooling_type pooling_type, int pos, int n_tok`
`31`	`31`	`}`
`32`	`32`	`}`
`33`	`33`
`34`		`-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id, enum llama_pooling_type pooling_type) {`
`35`		`- int n_tokens = tokens.size();`
	`34`	`+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id, enum llama_pooling_type pooling_type) {`
	`35`	`+ size_t n_tokens = tokens.size();`
`36`	`36`	`for (size_t i = 0; i < n_tokens; i++) {`
`37`	`37`	`bool logit = needs_logit(pooling_type, i, n_tokens);`
`38`	`38`	`llama_batch_add(batch, tokens[i], i, { seq_id }, logit);`
Original file line number	Diff line number	Diff line change
`@@ -87,9 +87,9 @@ static bool needs_logit(enum llama_pooling_type pooling_type, int pos, int n_tok`
`87`	`87`	`}`
`88`	`88`	`}`
`89`	`89`
`90`		`-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id, enum llama_pooling_type pooling_type) {`
`91`		`- int n_tokens = tokens.size();`
`92`		`- for (size_t i = 0; i < tokens.size(); i++) {`
	`90`	`+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id, enum llama_pooling_type pooling_type) {`
	`91`	`+ size_t n_tokens = tokens.size();`
	`92`	`+ for (size_t i = 0; i < n_tokens; i++) {`
`93`	`93`	`bool logit = needs_logit(pooling_type, i, n_tokens);`
`94`	`94`	`llama_batch_add(batch, tokens[i], i, { seq_id }, logit);`
`95`	`95`	`}`