
examples : predicted output for text generation #14739

Open · wants to merge 1 commit into base: master
15 changes: 11 additions & 4 deletions common/arg.cpp
@@ -1294,7 +1294,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.use_color = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_PREDICTED}));
add_opt(common_arg(
{"-t", "--threads"}, "N",
string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
@@ -1506,7 +1506,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.system_prompt = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PREDICTED}));
add_opt(common_arg(
{"--no-perf"},
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -3186,14 +3186,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.speculative.n_max = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_PREDICTED, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
add_opt(common_arg(
{"--draft-min", "--draft-n-min"}, "N",
string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
[](common_params & params, int value) {
params.speculative.n_min = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_PREDICTED, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
add_opt(common_arg(
{"--draft-p-split"}, "P",
string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
@@ -3208,6 +3208,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.p_min = std::stof(value);
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
add_opt(common_arg(
{"--draft-text"}, "PROMPT",
"draft text to use for prediction (default: empty)",
[](common_params & params, const std::string & value) {
params.speculative.text = value;
}
).set_examples({LLAMA_EXAMPLE_PREDICTED}).set_env("LLAMA_ARG_DRAFT_TEXT"));
add_opt(common_arg(
{"-cd", "--ctx-size-draft"}, "N",
string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
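The new `--draft-text` flag only stores the raw string in `params.speculative.text` (see the `common.h` change below); turning it into draft tokens is left to the example. The `predicted.cpp` source is not part of this excerpt, so the following is only a minimal sketch, assuming the `common_tokenize` helper from `common/common.h` and an already-initialized `llama_context * ctx`:

```cpp
// Sketch only (not the actual predicted.cpp): tokenize the user-supplied draft
// text so it can later be compared against the model's sampled tokens.
std::vector<llama_token> draft_tokens;
if (!params.speculative.text.empty()) {
    // no BOS: the draft continues the prompt rather than starting a new sequence
    draft_tokens = common_tokenize(ctx, params.speculative.text,
                                   /*add_special*/ false, /*parse_special*/ true);
}
```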
4 changes: 3 additions & 1 deletion common/common.h
@@ -82,7 +82,7 @@ enum llama_example {
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_DIFFUSION,

LLAMA_EXAMPLE_PREDICTED,
LLAMA_EXAMPLE_COUNT,
};

@@ -202,6 +202,8 @@ struct common_params_speculative {
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)

std::string text; // draft text to use for prediction

ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -31,6 +31,7 @@ else()
add_subdirectory(simple-chat)
add_subdirectory(speculative)
add_subdirectory(speculative-simple)
add_subdirectory(predicted)
add_subdirectory(gen-docs)
add_subdirectory(training)
add_subdirectory(diffusion)
5 changes: 5 additions & 0 deletions examples/predicted/CMakeLists.txt
@@ -0,0 +1,5 @@
set(TARGET llama-predicted)
add_executable(${TARGET} predicted.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
10 changes: 10 additions & 0 deletions examples/predicted/README.md
@@ -0,0 +1,10 @@
# llama.cpp/examples/predicted

Demonstration of predicted output generation with recovery. See `patch.sh` for an example and `lookup.sh` for comparison with lookup decoding.

# Algorithm

State tracked during generation (a toy sketch of the loop follows the list):

- `n_past`: cumulative number of tokens processed so far (prompt plus sampled tokens)
- `use_draft`: whether the draft is still being followed
- `id_last`: the last token that was sampled
- `batch_idx`: current index in the current batch (-1 means a new batch needs to be decoded)
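The `predicted.cpp` implementation itself is not shown in this diff. The following is a self-contained toy, with fixed token vectors standing in for the model output and the tokenized `--draft-text`, illustrating the verify-and-recover idea: accept tokens as long as the model's samples agree with the draft, and drop the draft at the first mismatch. In the real example, agreeing draft tokens can be verified in one batch, which is where the speed-up over token-by-token sampling comes from.

```cpp
#include <cstdio>
#include <vector>

int main() {
    // Hypothetical stand-ins: the real example would tokenize --draft-text and
    // sample from the model instead of reading from these fixed vectors.
    std::vector<int> draft  = {10, 11, 12, 13, 14};   // tokens from --draft-text
    std::vector<int> target = {10, 11, 12, 99, 100};  // tokens the model actually samples

    std::vector<int> output;
    bool   use_draft    = true;  // drop the draft after the first mismatch (recovery)
    size_t n_from_draft = 0;

    for (size_t i = 0; i < target.size(); ++i) {
        const int sampled = target[i];
        if (use_draft && i < draft.size() && sampled == draft[i]) {
            // draft token confirmed: in the real example these are verified in a
            // batch, so agreeing prefixes cost far less than per-token sampling
            ++n_from_draft;
        } else {
            use_draft = false;  // mismatch: continue with plain token-by-token decoding
        }
        output.push_back(sampled);
    }

    std::printf("generated %zu tokens, %zu confirmed from the draft\n",
                output.size(), n_from_draft);
}
```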
7 changes: 7 additions & 0 deletions examples/predicted/data/patch_code.txt
@@ -0,0 +1,7 @@
<Frame fill margin={0.25}>
<Plot grid ylim={[-1.5, 1.5]} xlabel="Phase (radians)" ylabel="Interference" title="Flux Capacitance">
<SymFill fy1={sin} fy2={cos} xlim={[0, 2*pi]} fill={blue} opacity={0.5} />
<SymPath fy={sin} xlim={[0, 2*pi]} />
<SymPath fy={cos} xlim={[0, 2*pi]} />
</Plot>
</Frame>
18 changes: 18 additions & 0 deletions examples/predicted/lookup.sh
@@ -0,0 +1,18 @@
# test lookup decoding (for comparison with the predicted example)

# reads the input code from stdin; the prompt is passed as the first argument

QUERY="$1"
CODE=$(cat)

BIN_DIR="../../build/bin"
BIN="$BIN_DIR/llama-lookup"

MODEL_DIR="../../fast_models"
MODEL="$MODEL_DIR/gemma-3-12b-it-q8_0.gguf"

SYSTEM="You are an assistant that makes changes to code. You are given a code snippet and a prompt. You need to make the changes to the code snippet to satisfy the prompt. You need to return the modified code snippet. Do not include other text or code block markers in your response."

PROMPT="${SYSTEM}\n\nPROMPT: ${QUERY}\n\nCODE:\n${CODE}\n\n"

$BIN -m "${MODEL}" -c 4096 -ngl 99 -fa --color --prompt "${PROMPT}" --draft-min 5 --draft-max 32
17 changes: 17 additions & 0 deletions examples/predicted/osmosis.sh
@@ -0,0 +1,17 @@
# run osmosis query

BIN_DIR="../../build/bin"
BIN="$BIN_DIR/llama-predicted"

MODEL_DIR="../../fast_models"
MODEL="$MODEL_DIR/osmosis-apply-1.7b-bf16.gguf"

SYSTEM=$(cat data/osmosis_system.txt)
CODE=$(cat data/osmosis_code.txt)
EDIT=$(cat data/osmosis_edit.txt)

NL=$'\n'
PROMPT="<code>${NL}${CODE}${NL}</code>${NL}${NL}<edit>${NL}${EDIT}${NL}</edit>"
DRAFT="<code>${NL}${CODE}${NL}</code>"

$BIN -m "${MODEL}" -c 0 -ngl 99 -fa --color --system-prompt "${SYSTEM}" --prompt "${PROMPT}" --draft-text "${DRAFT}" --draft-min 5 --draft-max 32
18 changes: 18 additions & 0 deletions examples/predicted/patch.sh
@@ -0,0 +1,18 @@
# test predicted-output decoding on a code-editing task

# reads the input code from stdin; the prompt is passed as the first argument

QUERY="$1"
CODE=$(cat)

BIN_DIR="../../build/bin"
BIN="$BIN_DIR/llama-predicted"

MODEL_DIR="../../fast_models"
MODEL="$MODEL_DIR/gemma-3-12b-it-q8_0.gguf"

SYSTEM="You are an assistant that makes changes to code. You are given a code snippet and a prompt. You need to make the changes to the code snippet to satisfy the prompt. You need to return the modified code snippet. Do not include other text or code block markers in your response."

PROMPT="PROMPT: ${QUERY}\n\nCODE:\n${CODE}\n\n"

$BIN -m "${MODEL}" -c 0 -ngl 99 -fa --color --system-prompt "${SYSTEM}" --prompt "${PROMPT}" --draft-text "${CODE}" --draft-min 5 --draft-max 32