Commit 3ba5c55

server : token healing for infilling/FIM
1 parent d5eea13 commit 3ba5c55

1 file changed: examples/server/server.cpp (+23 -15)

@@ -2088,6 +2088,8 @@ struct server_context {
                     slot.t_start_process_prompt = ggml_time_us();
                     slot.t_start_generation = 0;
 
+                    llama_token_healing_output token_healing_out{};
+
                     if (slot.infill) {
                         const bool add_bos = llama_should_add_bos_token(model);
                         bool suff_rm_leading_spc = true;
@@ -2107,6 +2109,12 @@
                         prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
                         suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
 
+                        if (slot.sparams.token_healing_enabled) {
+                            // For FIM roll back only the prefix part (i.e. cursor location)
+                            token_healing_out = llama_token_healing_rollback(ctx, slot.sparams.token_healing_type,
+                                                                             prefix_tokens, slot.sparams.token_healing_n_rollback);
+                        }
+
                         auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
                         auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
                         if (add_bos) {
@@ -2122,6 +2130,11 @@
                         prompt_tokens = embd_inp;
                     } else {
                         prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
+
+                        if (slot.sparams.token_healing_enabled) {
+                            token_healing_out = llama_token_healing_rollback(ctx, slot.sparams.token_healing_type,
+                                                                             prompt_tokens, slot.sparams.token_healing_n_rollback);
+                        }
                     }
 
                     slot.n_past = 0;
@@ -2136,6 +2149,16 @@
                         {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
                     });
 
+                    if (slot.sparams.token_healing_enabled) {
+                        slot.n_th_prefix = token_healing_out.prefix.size();
+                        LOG_VERBOSE("token healing prompt", {
+                            {"id_slot", slot.id},
+                            {"id_task", slot.id_task},
+                            {"removed_suffix", token_healing_out.prefix},
+                            {"n_tokens_removed", token_healing_out.n_tokens_removed}
+                        });
+                    }
+
                     // empty prompt passed -> release the slot and send empty response
                     if (prompt_tokens.empty()) {
                         LOG_INFO("empty prompt - releasing slot", {
@@ -2151,21 +2174,6 @@
                         continue;
                     }
 
-                    // Roll back prompt tokens if token healing
-                    llama_token_healing_output token_healing_out{};
-                    if (slot.sparams.token_healing_enabled) {
-                        token_healing_out = llama_token_healing_rollback(ctx, slot.sparams.token_healing_type,
-                                                                         prompt_tokens, slot.sparams.token_healing_n_rollback);
-                        slot.n_th_prefix = token_healing_out.prefix.size();
-                        slot.n_prompt_tokens = prompt_tokens.size();
-                        LOG_VERBOSE("token healing prompt", {
-                            {"id_slot", slot.id},
-                            {"id_task", slot.id_task},
-                            {"removed_suffix", token_healing_out.prefix},
-                            {"n_tokens_removed", token_healing_out.n_tokens_removed}
-                        });
-                    }
-
                     if (slot.embedding) {
                         // this prompt is too large to process - discard it
                         if (slot.n_prompt_tokens > n_ubatch) {
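
For context: token healing rolls the prompt back by a few tokens at the generation boundary and remembers the removed text, so sampling can re-tokenize across that boundary instead of being forced to continue from an awkward token split. This commit moves the rollback from after the empty-prompt check up into prompt construction, so each path heals the right token vector: in the FIM path only prefix_tokens are rolled back, because the cursor sits at the end of the prefix and the suffix text after it must stay intact. Below is a minimal sketch of the rollback step itself; rollback_tokens and piece_of are hypothetical names for illustration, not the llama_token_healing_rollback implementation added here.

    #include <cstdio>
    #include <functional>
    #include <string>
    #include <vector>

    struct token_healing_output {
        std::string prefix;       // text of the removed tokens; sampling must re-match it
        int n_tokens_removed = 0; // number of tokens popped off the end of the prompt
    };

    // Pop up to max_to_remove trailing tokens and record their concatenated text.
    // The sampler is then constrained so the healed continuation re-produces
    // (and extends) this prefix, possibly with a different tokenization.
    static token_healing_output rollback_tokens(std::vector<int> & tokens,
                                                const std::function<std::string(int)> & piece_of,
                                                int max_to_remove) {
        token_healing_output out;
        while (out.n_tokens_removed < max_to_remove && !tokens.empty()) {
            out.prefix = piece_of(tokens.back()) + out.prefix;
            tokens.pop_back();
            out.n_tokens_removed += 1;
        }
        return out;
    }

    int main() {
        // Toy vocabulary standing in for a real detokenizer.
        const std::vector<std::string> vocab = {"print", "(\"hel", "lo\")"};
        std::vector<int> prompt = {0, 1}; // "print" + "(\"hel", cursor after "hel"
        auto piece_of = [&](int t) { return vocab[t]; };

        token_healing_output out = rollback_tokens(prompt, piece_of, /*max_to_remove=*/1);
        // prompt is now just {0}; generation must start with text matching "(\"hel"
        std::printf("removed %d token(s), prefix to re-match: %s\n",
                    out.n_tokens_removed, out.prefix.c_str());
    }

The diff above also stashes the removed text's length in slot.n_th_prefix, presumably so later output handling can strip the re-generated healed prefix before streaming tokens back to the client.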
