@@ -2088,6 +2088,8 @@ struct server_context {
                 slot.t_start_process_prompt = ggml_time_us();
                 slot.t_start_generation = 0;

+                llama_token_healing_output token_healing_out{};
+
                 if (slot.infill) {
                     const bool add_bos = llama_should_add_bos_token(model);
                     bool suff_rm_leading_spc = true;
@@ -2107,6 +2109,12 @@ struct server_context {
                     prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
                     suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));

+                    if (slot.sparams.token_healing_enabled) {
+                        // For FIM roll back only the prefix part (i.e. cursor location)
+                        token_healing_out = llama_token_healing_rollback(ctx, slot.sparams.token_healing_type,
+                                                                         prefix_tokens, slot.sparams.token_healing_n_rollback);
+                    }
+
                     auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
                     auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
                     if (add_bos) {
@@ -2122,6 +2130,11 @@ struct server_context {
                     prompt_tokens = embd_inp;
                 } else {
                     prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
+
+                    if (slot.sparams.token_healing_enabled) {
+                        token_healing_out = llama_token_healing_rollback(ctx, slot.sparams.token_healing_type,
+                                                                         prompt_tokens, slot.sparams.token_healing_n_rollback);
+                    }
                 }

                 slot.n_past = 0;
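
Both call sites above pass the prompt's trailing tokens to `llama_token_healing_rollback` (in the infill branch only `prefix_tokens`, i.e. the text up to the cursor; in the plain completion branch the whole prompt). The following is a minimal, self-contained sketch of the rollback idea using a toy string vocabulary and hypothetical names, not the PR's actual implementation:

```cpp
#include <iostream>
#include <string>
#include <vector>

struct healing_output {
    std::string prefix;       // text removed from the end of the prompt
    int n_tokens_removed = 0; // number of prompt tokens rolled back
};

// Remove the last prompt token when its text is a strict prefix of some
// vocabulary entry, so the sampler can later regenerate it as part of a
// longer ("healed") token that crosses the old token boundary.
static healing_output rollback_one(std::vector<std::string> & prompt,
                                   const std::vector<std::string> & vocab) {
    healing_output out;
    if (prompt.empty()) {
        return out;
    }
    const std::string last = prompt.back();
    for (const std::string & v : vocab) {
        if (v.size() > last.size() && v.compare(0, last.size(), last) == 0) {
            out.prefix = last;
            out.n_tokens_removed = 1;
            prompt.pop_back();
            break;
        }
    }
    return out;
}

int main() {
    const std::vector<std::string> vocab = {"http", "://", ":", "https"};
    std::vector<std::string> prompt = {"see", " http", ":"}; // prompt ends mid-token
    const healing_output out = rollback_one(prompt, vocab);
    // a sampler would now only consider tokens whose text starts with out.prefix
    std::cout << "removed \"" << out.prefix << "\" (" << out.n_tokens_removed << " token(s))\n";
}
```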
@@ -2136,6 +2149,16 @@ struct server_context {
                     {"prompt_tokens",   tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
                 });

+                if (slot.sparams.token_healing_enabled) {
+                    slot.n_th_prefix = token_healing_out.prefix.size();
+                    LOG_VERBOSE("token healing prompt", {
+                        {"id_slot",          slot.id},
+                        {"id_task",          slot.id_task},
+                        {"removed_suffix",   token_healing_out.prefix},
+                        {"n_tokens_removed", token_healing_out.n_tokens_removed}
+                    });
+                }
+
                 // empty prompt passed -> release the slot and send empty response
                 if (prompt_tokens.empty()) {
                     LOG_INFO("empty prompt - releasing slot", {
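
`slot.n_th_prefix` records the byte length of the text that was rolled back. One plausible way such a counter is consumed later (not shown in this diff) is to strip the healed prefix from the first generated pieces before streaming them, so the client does not receive the rolled-back characters twice. The helper below is a hypothetical illustration only:

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>

// Drop up to n_th_prefix bytes from the front of a generated piece so the
// healed prefix is not sent to the client twice.
static std::string strip_healed_prefix(const std::string & piece, size_t & n_th_prefix) {
    const size_t n_skip = std::min(n_th_prefix, piece.size());
    n_th_prefix -= n_skip;
    return piece.substr(n_skip);
}

int main() {
    size_t n_th_prefix = 1;               // e.g. the ":" removed during rollback
    std::string piece = "://example.com"; // first generated piece re-emits that prefix
    std::cout << strip_healed_prefix(piece, n_th_prefix) << "\n"; // prints "//example.com"
}
```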
@@ -2151,21 +2174,6 @@ struct server_context {
                     continue;
                 }

-                // Roll back prompt tokens if token healing
-                llama_token_healing_output token_healing_out{};
-                if (slot.sparams.token_healing_enabled) {
-                    token_healing_out = llama_token_healing_rollback(ctx, slot.sparams.token_healing_type,
-                                                                     prompt_tokens, slot.sparams.token_healing_n_rollback);
-                    slot.n_th_prefix = token_healing_out.prefix.size();
-                    slot.n_prompt_tokens = prompt_tokens.size();
-                    LOG_VERBOSE("token healing prompt", {
-                        {"id_slot",          slot.id},
-                        {"id_task",          slot.id_task},
-                        {"removed_suffix",   token_healing_out.prefix},
-                        {"n_tokens_removed", token_healing_out.n_tokens_removed}
-                    });
-                }
-
                 if (slot.embedding) {
                     // this prompt is too large to process - discard it
                     if (slot.n_prompt_tokens > n_ubatch) {
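
For completeness, the other half of token healing happens at sampling time: while a healing prefix is pending, only tokens whose text starts with that prefix should be candidates. The sketch below illustrates that idea with a toy vocabulary and hypothetical names; the PR's actual sampler integration is outside the hunks shown here.

```cpp
#include <iostream>
#include <string>
#include <vector>

// Return the indices of vocabulary entries that are still allowed as the next
// token while a healing prefix is pending: their text must start with it.
static std::vector<size_t> allowed_tokens(const std::vector<std::string> & vocab,
                                          const std::string & healing_prefix) {
    std::vector<size_t> allowed;
    for (size_t i = 0; i < vocab.size(); ++i) {
        const std::string & v = vocab[i];
        if (v.size() >= healing_prefix.size() &&
            v.compare(0, healing_prefix.size(), healing_prefix) == 0) {
            allowed.push_back(i);
        }
    }
    return allowed;
}

int main() {
    const std::vector<std::string> vocab = {":", "://", ":-)", "//", "http"};
    for (const size_t i : allowed_tokens(vocab, ":")) {
        std::cout << vocab[i] << "\n"; // prints ":", "://" and ":-)"
    }
}
```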