Skip to content

Commit 414fc13

Browse files
committed
token healing : refactor to return struct
1 parent db9c018 commit 414fc13

File tree

3 files changed

+44
-52
lines changed

3 files changed

+44
-52
lines changed

common/sampling.cpp

Lines changed: 20 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -49,18 +49,13 @@ static size_t get_max_token_length(const llama_context * ctx_main) {
4949
return len;
5050
}
5151

52-
struct token_healing_info {
53-
std::string prefix;
54-
int n_tokens_removed;
55-
};
56-
57-
token_healing_info llama_token_healing_get_prefix(
58-
const llama_context * ctx_main,
59-
const llama_token_healing_type th_type,
60-
const std::vector<llama_token> & tokens,
61-
int max_to_remove) {
52+
static llama_token_healing_output llama_token_healing_get_prefix(
53+
const llama_context * ctx_main,
54+
const llama_token_healing_type th_type,
55+
const std::vector<llama_token> & tokens,
56+
int max_to_remove) {
6257
if (tokens.size() <= 1) {
63-
return {"", 0};
58+
return {};
6459
}
6560

6661
const int n_ctx = tokens.size();
@@ -122,34 +117,28 @@ token_healing_info llama_token_healing_get_prefix(
122117
// Token healing (external)
123118
//
124119

125-
std::string llama_token_healing_rollback(
126-
const llama_context * ctx_main,
127-
llama_token_healing_type th_type,
128-
std::vector<llama_token> & tokens,
129-
int max_to_remove,
130-
int * n_removed) {
131-
if (n_removed != nullptr) {
132-
*n_removed = 0;
133-
}
120+
llama_token_healing_output llama_token_healing_rollback(
121+
const llama_context * ctx_main,
122+
llama_token_healing_type th_type,
123+
std::vector<llama_token> & tokens,
124+
int max_to_remove) {
134125
// NB. To avoid returning empty `tokens`, at least 1 token will remain in `tokens` after rolling back.
135126
// It is the caller's responsibility to add BOS to the start of the prompt if they want to roll back the whole prompt.
136-
token_healing_info info = llama_token_healing_get_prefix(ctx_main, th_type, tokens, max_to_remove);
127+
llama_token_healing_output out = llama_token_healing_get_prefix(ctx_main, th_type, tokens, max_to_remove);
137128

138129
// If constrained decoding would give back the original prompt, there is no need to modify the prompt.
139130
const bool is_multi_step = th_type == llama_token_healing_type::ROLLBACK_MULTI ||
140131
th_type == llama_token_healing_type::DYNAMIC_MULTI;
141-
const std::vector<llama_token> candidates = token_healing_get_candidates(ctx_main, info.prefix, is_multi_step);
142-
LOG("token_healing: prefix = '%s' (%d tokens)\n", info.prefix.c_str(), info.n_tokens_removed);
143-
if (info.n_tokens_removed == 1 && candidates.size() == 1) {
132+
const std::vector<llama_token> candidates = token_healing_get_candidates(ctx_main, out.prefix, is_multi_step);
133+
LOG("token_healing: prefix = '%s' (%d tokens)\n", out.prefix.c_str(), out.n_tokens_removed);
134+
if (out.n_tokens_removed == 1 && candidates.size() == 1) {
144135
LOG("token_healing: nothing to heal\n");
145-
return "";
136+
return {};
146137
}
147-
// Finalize outputs
148-
if (n_removed != nullptr) {
149-
*n_removed = info.n_tokens_removed;
150-
}
151-
tokens.resize(tokens.size() - info.n_tokens_removed);
152-
return info.prefix;
138+
139+
// Finally, trim prompt tokens
140+
tokens.resize(tokens.size() - out.n_tokens_removed);
141+
return out;
153142
}
154143

155144
void llama_token_healing_set_prefix(llama_sampling_context * ctx_sampling, const std::string & prefix) {

common/sampling.h

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -176,13 +176,17 @@ void llama_sampling_accept(
176176
// Token healing
177177
//
178178

179-
// Roll back `tokens` for constrained generation according to the token healing
180-
// strategy. Returns the prefix for constrained generation.
181-
std::string llama_token_healing_rollback(
182-
const llama_context * ctx_main,
183-
llama_token_healing_type th_type,
184-
std::vector<llama_token> & tokens,
185-
int max_to_remove = -1,
186-
int * n_removed = nullptr);
179+
struct llama_token_healing_output {
180+
std::string prefix;
181+
int n_tokens_removed;
182+
};
183+
184+
// Roll back `tokens` for constrained generation according to the token healing strategy.
185+
// Pass the `prefix` member of the returned struct to `llama_token_healing_set_prefix` before the first sampling.
186+
llama_token_healing_output llama_token_healing_rollback(
187+
const llama_context * ctx_main,
188+
llama_token_healing_type th_type,
189+
std::vector<llama_token> & tokens,
190+
int max_to_remove = -1);
187191

188192
void llama_token_healing_set_prefix(llama_sampling_context * ctx_sampling, const std::string & prefix);

examples/main/main.cpp

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -295,11 +295,10 @@ int main(int argc, char ** argv) {
295295
sparams.token_healing_enabled = false;
296296
LOG("token healing: disabled due to custom suffix/conversation mode");
297297
}
298-
std::string token_healing_prefix;
299-
int token_healing_n_removed = 0;
298+
llama_token_healing_output token_healing_out{};
300299
if (!params.interactive_first && sparams.token_healing_enabled) {
301-
token_healing_prefix = llama_token_healing_rollback(ctx, sparams.token_healing_type, embd_inp,
302-
sparams.token_healing_n_rollback, &token_healing_n_removed);
300+
token_healing_out = llama_token_healing_rollback(ctx, sparams.token_healing_type, embd_inp,
301+
sparams.token_healing_n_rollback);
303302
}
304303

305304
// Should not run without any tokens
@@ -326,7 +325,7 @@ int main(int argc, char ** argv) {
326325
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
327326
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
328327

329-
original_prompt_len = original_inp.size() - token_healing_n_removed;
328+
original_prompt_len = original_inp.size() - token_healing_out.n_tokens_removed;
330329
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
331330
LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
332331
LOG("guidance_offset: %s", log_tostr(guidance_offset));
@@ -548,7 +547,7 @@ int main(int argc, char ** argv) {
548547
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
549548
exit(1);
550549
}
551-
llama_token_healing_set_prefix(ctx_sampling, token_healing_prefix);
550+
llama_token_healing_set_prefix(ctx_sampling, token_healing_out.prefix);
552551

553552
if (llama_model_has_encoder(model)) {
554553
int enc_input_size = embd_inp.size();
@@ -883,7 +882,8 @@ int main(int argc, char ** argv) {
883882
assistant_ss << llama_token_to_piece(ctx, id, false);
884883
}
885884

886-
token_healing_n_removed = 0;
885+
token_healing_out = {};
886+
887887
if (n_past > 0 && is_interacting) {
888888
LOG("waiting for user input\n");
889889

@@ -962,9 +962,8 @@ int main(int argc, char ** argv) {
962962
const int max_to_remove = sparams.token_healing_n_rollback < 0
963963
? n_new_tokens
964964
: std::min(sparams.token_healing_n_rollback, n_new_tokens);
965-
token_healing_prefix = llama_token_healing_rollback(ctx, sparams.token_healing_type, embd_inp,
966-
max_to_remove, &token_healing_n_removed);
967-
n_bytes_to_skip = token_healing_prefix.size();
965+
token_healing_out = llama_token_healing_rollback(ctx, sparams.token_healing_type, embd_inp, max_to_remove);
966+
n_bytes_to_skip = token_healing_out.prefix.size();
968967
}
969968

970969
for (size_t i = original_size; i < embd_inp.size(); ++i) {
@@ -976,7 +975,7 @@ int main(int argc, char ** argv) {
976975
// reset assistant message
977976
assistant_ss.str("");
978977

979-
n_remain -= line_inp.size() + token_healing_n_removed;
978+
n_remain -= line_inp.size() + token_healing_out.n_tokens_removed;
980979
LOG("n_remain: %d\n", n_remain);
981980
} else {
982981
LOG("empty line, passing control back\n");
@@ -988,9 +987,9 @@ int main(int argc, char ** argv) {
988987
if (n_past > 0) {
989988
if (is_interacting) {
990989
llama_sampling_reset(ctx_sampling);
991-
if (token_healing_n_removed > 0) {
990+
if (token_healing_out.n_tokens_removed > 0) {
992991
// Set new prefix after an interaction
993-
llama_token_healing_set_prefix(ctx_sampling, token_healing_prefix);
992+
llama_token_healing_set_prefix(ctx_sampling, token_healing_out.prefix);
994993
}
995994
}
996995
is_interacting = false;

0 commit comments

Comments
 (0)