Skip to content

Commit 7778c04

Browse files
aa956, qnixsynapse
authored and committed
server : add server parameters for draft model cache type (ggml-org#13782)
Co-authored-by: aa956 <27946957+aa956@users.noreply.github.com>
1 parent ecfea67 commit 7778c04

File tree

2 files changed

+5
-4
lines changed

2 files changed

+5
-4
lines changed

common/common.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -199,6 +199,9 @@ struct common_params_speculative {
199199
float p_split = 0.1f; // speculative decoding split probability
200200
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
201201

202+
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
203+
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
204+
202205
struct cpu_params cpuparams;
203206
struct cpu_params cpuparams_batch;
204207

tools/server/server.cpp

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1969,10 +1969,8 @@ struct server_context {
19691969
params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
19701970
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
19711971
params_dft.n_parallel = 1;
1972-
1973-
// force F16 KV cache for the draft model for extra performance
1974-
params_dft.cache_type_k = GGML_TYPE_F16;
1975-
params_dft.cache_type_v = GGML_TYPE_F16;
1972+
params_dft.cache_type_k = params_base.speculative.cache_type_k;
1973+
params_dft.cache_type_v = params_base.speculative.cache_type_v;
19761974

19771975
llama_init_dft = common_init_from_params(params_dft);
19781976

0 commit comments

Comments
 (0)