
examples : predicted output for text generation #14739

Open · wants to merge 1 commit into base: master
15 changes: 11 additions & 4 deletions common/arg.cpp
@@ -1294,7 +1294,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.use_color = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_PREDICTED}));
add_opt(common_arg(
{"-t", "--threads"}, "N",
string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
@@ -1506,7 +1506,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.system_prompt = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PREDICTED}));
add_opt(common_arg(
{"--no-perf"},
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -3186,14 +3186,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.speculative.n_max = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_PREDICTED, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
add_opt(common_arg(
{"--draft-min", "--draft-n-min"}, "N",
string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
[](common_params & params, int value) {
params.speculative.n_min = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_PREDICTED, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
add_opt(common_arg(
{"--draft-p-split"}, "P",
string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
@@ -3208,6 +3208,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.p_min = std::stof(value);
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
add_opt(common_arg(
{"--draft-text"}, "PROMPT",
"draft text to use for prediction (default: empty)",
[](common_params & params, const std::string & value) {
params.speculative.text = value;
}
).set_examples({LLAMA_EXAMPLE_PREDICTED}).set_env("LLAMA_ARG_DRAFT_TEXT"));
add_opt(common_arg(
{"-cd", "--ctx-size-draft"}, "N",
string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
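The new `--draft-text` flag only stores the raw string in `params.speculative.text` (see the `common.h` change below); turning it into draft tokens is left to the example. The `predicted.cpp` source is not part of this excerpt, so the following is only a minimal sketch, assuming the `common_tokenize` helper from `common/common.h` and an already-initialized `llama_context * ctx`:

```cpp
// Sketch only (not the actual predicted.cpp): tokenize the user-supplied draft
// text so it can later be compared against the model's sampled tokens.
std::vector<llama_token> draft_tokens;
if (!params.speculative.text.empty()) {
    // no BOS: the draft continues the prompt rather than starting a new sequence
    draft_tokens = common_tokenize(ctx, params.speculative.text,
                                   /*add_special*/ false, /*parse_special*/ true);
}
```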
4 changes: 3 additions & 1 deletion common/common.h
@@ -82,7 +82,7 @@ enum llama_example {
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_DIFFUSION,

LLAMA_EXAMPLE_PREDICTED,
LLAMA_EXAMPLE_COUNT,
};

@@ -202,6 +202,8 @@ struct common_params_speculative {
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)

std::string text; // draft text to use for prediction

ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -31,6 +31,7 @@ else()
add_subdirectory(simple-chat)
add_subdirectory(speculative)
add_subdirectory(speculative-simple)
add_subdirectory(predicted)
add_subdirectory(gen-docs)
add_subdirectory(training)
add_subdirectory(diffusion)
5 changes: 5 additions & 0 deletions examples/predicted/CMakeLists.txt
@@ -0,0 +1,5 @@
set(TARGET llama-predicted)
add_executable(${TARGET} predicted.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
10 changes: 10 additions & 0 deletions examples/predicted/README.md
@@ -0,0 +1,10 @@
# llama.cpp/examples/predicted

Demonstration of predicted output generation with recovery. See `patch.sh` for an example and `lookup.sh` for comparison with lookup decoding.

# Algorithm

State tracked during generation (a toy sketch of the loop follows the list):

- `n_past`: cumulative number of tokens processed so far (prompt plus sampled tokens)
- `use_draft`: whether the draft is still being followed
- `id_last`: the last token that was sampled
- `batch_idx`: current index in the current batch (-1 means a new batch needs to be decoded)
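The `predicted.cpp` implementation itself is not shown in this diff. The following is a self-contained toy, with fixed token vectors standing in for the model output and the tokenized `--draft-text`, illustrating the verify-and-recover idea: accept tokens as long as the model's samples agree with the draft, and drop the draft at the first mismatch. In the real example, agreeing draft tokens can be verified in one batch, which is where the speed-up over token-by-token sampling comes from.

```cpp
#include <cstdio>
#include <vector>

int main() {
    // Hypothetical stand-ins: the real example would tokenize --draft-text and
    // sample from the model instead of reading from these fixed vectors.
    std::vector<int> draft  = {10, 11, 12, 13, 14};   // tokens from --draft-text
    std::vector<int> target = {10, 11, 12, 99, 100};  // tokens the model actually samples

    std::vector<int> output;
    bool   use_draft    = true;  // drop the draft after the first mismatch (recovery)
    size_t n_from_draft = 0;

    for (size_t i = 0; i < target.size(); ++i) {
        const int sampled = target[i];
        if (use_draft && i < draft.size() && sampled == draft[i]) {
            // draft token confirmed: in the real example these are verified in a
            // batch, so agreeing prefixes cost far less than per-token sampling
            ++n_from_draft;
        } else {
            use_draft = false;  // mismatch: continue with plain token-by-token decoding
        }
        output.push_back(sampled);
    }

    std::printf("generated %zu tokens, %zu confirmed from the draft\n",
                output.size(), n_from_draft);
}
```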
7 changes: 7 additions & 0 deletions examples/predicted/data/patch_code.txt
@@ -0,0 +1,7 @@
<Frame fill margin={0.25}>
<Plot grid ylim={[-1.5, 1.5]} xlabel="Phase (radians)" ylabel="Interference" title="Flux Capacitance">
<SymFill fy1={sin} fy2={cos} xlim={[0, 2*pi]} fill={blue} opacity={0.5} />
<SymPath fy={sin} xlim={[0, 2*pi]} />
<SymPath fy={cos} xlim={[0, 2*pi]} />
</Plot>
</Frame>
18 changes: 18 additions & 0 deletions examples/predicted/lookup.sh
@@ -0,0 +1,18 @@
# test lookup decoding (for comparison with the predicted example)

# reads the input code from stdin; the prompt is passed as the first argument

QUERY="$1"
CODE=$(cat)

BIN_DIR="../../build/bin"
BIN="$BIN_DIR/llama-lookup"

MODEL_DIR="../../fast_models"
MODEL="$MODEL_DIR/gemma-3-12b-it-q8_0.gguf"

SYSTEM="You are an assistant that makes changes to code. You are given a code snippet and a prompt. You need to make the changes to the code snippet to satisfy the prompt. You need to return the modified code snippet. Do not include other text or code block markers in your response."

PROMPT="${SYSTEM}\n\nPROMPT: ${QUERY}\n\nCODE:\n${CODE}\n\n"

$BIN -m "${MODEL}" -c 4096 -ngl 99 -fa --color --prompt "${PROMPT}" --draft-min 5 --draft-max 32
17 changes: 17 additions & 0 deletions examples/predicted/osmosis.sh
@@ -0,0 +1,17 @@
# run osmosis query

BIN_DIR="../../build/bin"
BIN="$BIN_DIR/llama-predicted"

MODEL_DIR="../../fast_models"
MODEL="$MODEL_DIR/osmosis-apply-1.7b-bf16.gguf"

SYSTEM=$(cat data/osmosis_system.txt)
CODE=$(cat data/osmosis_code.txt)
EDIT=$(cat data/osmosis_edit.txt)

NL=$'\n'
PROMPT="<code>${NL}${CODE}${NL}</code>${NL}${NL}<edit>${NL}${EDIT}${NL}</edit>"
DRAFT="<code>${NL}${CODE}${NL}</code>"

$BIN -m "${MODEL}" -c 0 -ngl 99 -fa --color --system-prompt "${SYSTEM}" --prompt "${PROMPT}" --draft-text "${DRAFT}" --draft-min 5 --draft-max 32
18 changes: 18 additions & 0 deletions examples/predicted/patch.sh
@@ -0,0 +1,18 @@
# test predicted-output decoding on a code-editing task

# reads the input code from stdin; the prompt is passed as the first argument

QUERY="$1"
CODE=$(cat)

BIN_DIR="../../build/bin"
BIN="$BIN_DIR/llama-predicted"

MODEL_DIR="../../fast_models"
MODEL="$MODEL_DIR/gemma-3-12b-it-q8_0.gguf"

SYSTEM="You are an assistant that makes changes to code. You are given a code snippet and a prompt. You need to make the changes to the code snippet to satisfy the prompt. You need to return the modified code snippet. Do not include other text or code block markers in your response."

PROMPT="PROMPT: ${QUERY}\n\nCODE:\n${CODE}\n\n"

$BIN -m "${MODEL}" -c 0 -ngl 99 -fa --color --system-prompt "${SYSTEM}" --prompt "${PROMPT}" --draft-text "${CODE}" --draft-min 5 --draft-max 32