File tree Expand file tree Collapse file tree 2 files changed +5
-4
lines changed Expand file tree Collapse file tree 2 files changed +5
-4
lines changed Original file line number Diff line number Diff line change @@ -199,6 +199,9 @@ struct common_params_speculative {
199
199
float p_split = 0 .1f ; // speculative decoding split probability
200
200
float p_min = 0 .75f ; // minimum speculative decoding probability (greedy)
201
201
202
+ ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
203
+ ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
204
+
202
205
struct cpu_params cpuparams;
203
206
struct cpu_params cpuparams_batch;
204
207
Original file line number Diff line number Diff line change @@ -1969,10 +1969,8 @@ struct server_context {
1969
1969
params_dft.n_ctx = params_base.speculative .n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative .n_ctx ;
1970
1970
params_dft.n_gpu_layers = params_base.speculative .n_gpu_layers ;
1971
1971
params_dft.n_parallel = 1 ;
1972
-
1973
- // force F16 KV cache for the draft model for extra performance
1974
- params_dft.cache_type_k = GGML_TYPE_F16;
1975
- params_dft.cache_type_v = GGML_TYPE_F16;
1972
+ params_dft.cache_type_k = params_base.speculative .cache_type_k ;
1973
+ params_dft.cache_type_v = params_base.speculative .cache_type_v ;
1976
1974
1977
1975
llama_init_dft = common_init_from_params (params_dft);
1978
1976
You can’t perform that action at this time.
0 commit comments