server : fix pooled embedding output (#14645)

iamlemec · web-flow · commit 0c1df14b5f8d · 2025-07-12T13:21:02.000+03:00
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
@@ -2581,25 +2581,27 @@ struct server_context {
                 continue;
             }
 
-            const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-            if (embd == NULL) {
+            const float * embd = nullptr;
+            if (llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE) {
                 embd = llama_get_embeddings_ith(ctx, i);
+            } else {
+                embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
             }
 
-            if (embd == NULL) {
+            if (embd == nullptr) {
                 SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
 
                 res->embedding.push_back(std::vector<float>(n_embd, 0.0f));
                 continue;
             }
 
             // normalize only when there is pooling
-            // TODO: configurable
             if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) {
                 common_embd_normalize(embd, embd_res.data(), n_embd, 2);
                 res->embedding.push_back(embd_res);
+                break;
             } else {
-                res->embedding.push_back({ embd, embd + n_embd });
+                res->embedding.emplace_back(embd, embd + n_embd);
             }
         }
 

Original file line number	Diff line number	Diff line change
`@@ -2581,25 +2581,27 @@ struct server_context {`
`2581`	`2581`	`continue;`
`2582`	`2582`	`}`
`2583`	`2583`
`2584`		`- const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);`
`2585`		`- if (embd == NULL) {`
	`2584`	`+ const float * embd = nullptr;`
	`2585`	`+ if (llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE) {`
`2586`	`2586`	`embd = llama_get_embeddings_ith(ctx, i);`
	`2587`	`+ } else {`
	`2588`	`+ embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);`
`2587`	`2589`	`}`
`2588`	`2590`
`2589`		`- if (embd == NULL) {`
	`2591`	`+ if (embd == nullptr) {`
`2590`	`2592`	`SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);`
`2591`	`2593`
`2592`	`2594`	`res->embedding.push_back(std::vector<float>(n_embd, 0.0f));`
`2593`	`2595`	`continue;`
`2594`	`2596`	`}`
`2595`	`2597`
`2596`	`2598`	`// normalize only when there is pooling`
`2597`		`- // TODO: configurable`
`2598`	`2599`	`if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) {`
`2599`	`2600`	`common_embd_normalize(embd, embd_res.data(), n_embd, 2);`
`2600`	`2601`	`res->embedding.push_back(embd_res);`
	`2602`	`+ break;`
`2601`	`2603`	`} else {`
`2602`		`- res->embedding.push_back({ embd, embd + n_embd });`
	`2604`	`+ res->embedding.emplace_back(embd, embd + n_embd);`
`2603`	`2605`	`}`
`2604`	`2606`	`}`
`2605`	`2607`