
Commit 1a59845

graph : rename update() to can_reuse()
ggml-ci
1 parent: 8b06546

File tree

6 files changed: +19 -25 lines
include/llama.h

Lines changed: 0 additions & 2 deletions
@@ -375,8 +375,6 @@ extern "C" {
         bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
                           // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
                           // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
-
-        bool graph_reuse; // reuse previous compute graphs when possible
     };

     // model quantization parameters
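For API consumers, the deletion above means graph reuse is no longer opt-in. A minimal sketch of the caller-visible difference, assuming only the llama.h declarations shown in this diff:

#include "llama.h"

int main(void) {
    llama_context_params params = llama_context_default_params();

    // before this commit, reuse had to be requested explicitly:
    //     params.graph_reuse = true; // field removed above
    //
    // after it, there is nothing to set: the context always asks the previous
    // graph result whether it can be reused and rebuilds when it cannot

    (void) params;
    return 0;
}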

src/llama-context.cpp

Lines changed: 4 additions & 6 deletions
@@ -101,8 +101,7 @@ llama_context::llama_context(

     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);

-    cparams.op_offload  = params.op_offload;
-    cparams.graph_reuse = params.graph_reuse;
+    cparams.op_offload = params.op_offload;

     const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;

@@ -689,9 +688,9 @@ llm_graph_result_i * llama_context::process_ubatch(const llama_ubatch & ubatch,
     // in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters
     const auto gparams = graph_params(res, ubatch, mctx, gtype);

-    const bool can_reuse = cparams.graph_reuse && res->update(gparams);
-    if (can_reuse) {
-        LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
+    if (res->can_reuse(gparams)) {
+        //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
+
         n_reused++;
     } else {
         res->reset();

@@ -2186,7 +2185,6 @@ llama_context_params llama_context_default_params() {
         /*.no_perf     =*/ true,
         /*.op_offload  =*/ true,
         /*.swa_full    =*/ true,
-        /*.graph_reuse =*/ false,
     };

     return result;
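Note that the new call site drops both the cparams.graph_reuse gate and the intermediate can_reuse local: the predicate alone now selects the path. A self-contained toy of that reuse-or-rebuild flow (stand-in types, not llama.cpp's; the real check is res->can_reuse(gparams) with res->reset() on failure):

#include <cstdio>

struct graph_params {
    int n_tokens;
};

struct graph_result {
    graph_params params = {0};
    bool built = false;

    // stand-in for llm_graph_result::can_reuse(): true iff the cached graph
    // built for `params` would be identical under `p`
    bool can_reuse(const graph_params & p) const {
        return built && params.n_tokens == p.n_tokens;
    }

    void build(const graph_params & p) {
        params = p;
        built = true;
    }
};

int main() {
    graph_result res;
    int n_reused = 0;

    const int batches[4] = {32, 32, 32, 16}; // same shape 3x, then a change

    for (int n : batches) {
        const graph_params gparams = {n};

        if (res.can_reuse(gparams)) {
            n_reused++;         // keep the cached graph, as process_ubatch() does
        } else {
            res.build(gparams); // topology changed (or first run): rebuild
        }
    }

    printf("n_reused = %d\n", n_reused); // prints: n_reused = 2
    return 0;
}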

src/llama-cparams.cpp

Lines changed: 0 additions & 1 deletion
@@ -27,7 +27,6 @@ bool llama_cparams::is_same(const llama_cparams & other) const {
         no_perf == other.no_perf &&
         warmup == other.warmup &&
         op_offload == other.op_offload &&
-        graph_reuse == other.graph_reuse &&
         pooling_type == other.pooling_type &&
         cb_eval == other.cb_eval &&
         cb_eval_user_data == other.cb_eval_user_data;

src/llama-cparams.h

Lines changed: 0 additions & 1 deletion
@@ -33,7 +33,6 @@ struct llama_cparams {
     bool no_perf;
     bool warmup;
     bool op_offload;
-    bool graph_reuse;

     enum llama_pooling_type pooling_type;

src/llama-graph.cpp

Lines changed: 5 additions & 5 deletions
@@ -28,7 +28,7 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
     }
 }

-bool llm_graph_input_embd::update(const llm_graph_params & params) {
+bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
     bool res = true;

     res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);

@@ -59,7 +59,7 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     }
 }

-bool llm_graph_input_pos::update(const llm_graph_params & params) {
+bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
     bool res = true;

     res &= pos->ne[0] == params.ubatch.n_tokens;

@@ -135,7 +135,7 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
     }
 }

-bool llm_graph_input_out_ids::update(const llm_graph_params & params) {
+bool llm_graph_input_out_ids::can_reuse(const llm_graph_params & params) {
     bool res = true;

     res &= n_outputs == params.n_outputs;

@@ -312,7 +312,7 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
     mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
 }

-bool llm_graph_input_attn_kv_unified::update(const llm_graph_params & params) {
+bool llm_graph_input_attn_kv_unified::can_reuse(const llm_graph_params & params) {
     const auto * mctx = static_cast<const llama_kv_cache_unified_context *>(params.mctx);

     this->mctx = mctx;

@@ -342,7 +342,7 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
     mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
 }

-bool llm_graph_input_attn_kv_unified_iswa::update(const llm_graph_params & params) {
+bool llm_graph_input_attn_kv_unified_iswa::can_reuse(const llm_graph_params & params) {
     const auto * mctx = static_cast<const llama_kv_cache_unified_iswa_context *>(params.mctx);

     this->mctx = mctx;
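All five overrides above share one shape: start from res = true, AND in one comparison per cached input property, and refresh the stored memory context pointer. A standalone imitation of the llm_graph_input_pos-style check (a toy tensor type stands in for ggml_tensor; the real code compares pos->ne[0] against params.ubatch.n_tokens):

#include <cstdint>
#include <cstdio>

// stand-in for ggml_tensor: only the first dimension matters here
struct tensor {
    int64_t ne0;
};

struct ubatch_params {
    uint32_t n_tokens;
};

struct input_pos {
    tensor * pos = nullptr;

    // imitation of llm_graph_input_pos::can_reuse(): the cached position
    // tensor must already hold exactly n_tokens elements
    bool can_reuse(const ubatch_params & params) const {
        bool res = true;

        res &= pos && pos->ne0 == (int64_t) params.n_tokens;

        return res;
    }
};

int main() {
    tensor pos = {512};

    input_pos in;
    in.pos = &pos;

    printf("%d %d\n", in.can_reuse({512}), in.can_reuse({256})); // prints: 1 0
    return 0;
}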

src/llama-graph.h

Lines changed: 10 additions & 10 deletions
@@ -83,7 +83,7 @@ class llm_graph_input_i {

     // return true if the resulting input tensors using the provided graph parameters would be
     // the same as the previous input tensors that we have currently stored in the object
-    virtual bool update(const llm_graph_params & params) {
+    virtual bool can_reuse(const llm_graph_params & params) {
         // returning false here by default will prevent from reusing the graph if the check
         // for the input type has not been implemented yet
         GGML_UNUSED(params);

@@ -100,7 +100,7 @@ class llm_graph_input_embd : public llm_graph_input_i {

     void set_input(const llama_ubatch * ubatch) override;

-    bool update(const llm_graph_params & params) override;
+    bool can_reuse(const llm_graph_params & params) override;

     ggml_tensor * tokens = nullptr; // I32 [n_batch]
     ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]

@@ -113,7 +113,7 @@ class llm_graph_input_pos : public llm_graph_input_i {

     void set_input(const llama_ubatch * ubatch) override;

-    bool update(const llm_graph_params & params) override;
+    bool can_reuse(const llm_graph_params & params) override;

     ggml_tensor * pos = nullptr; // I32 [n_batch]

@@ -173,7 +173,7 @@ class llm_graph_input_out_ids : public llm_graph_input_i {

     void set_input(const llama_ubatch * ubatch) override;

-    bool update(const llm_graph_params & params) override;
+    bool can_reuse(const llm_graph_params & params) override;

     ggml_tensor * out_ids; // I32 [n_outputs]

@@ -265,7 +265,7 @@ class llm_graph_input_attn_kv_unified : public llm_graph_input_i {

     void set_input(const llama_ubatch * ubatch) override;

-    bool update(const llm_graph_params & params) override;
+    bool can_reuse(const llm_graph_params & params) override;

     ggml_tensor * get_k_idxs() const { return self_k_idxs; }
     ggml_tensor * get_v_idxs() const { return self_v_idxs; }

@@ -298,7 +298,7 @@ class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {

     void set_input(const llama_ubatch * ubatch) override;

-    bool update(const llm_graph_params & params) override;
+    bool can_reuse(const llm_graph_params & params) override;

     ggml_tensor * get_k_idxs() const { return self_k_idxs; }
     ggml_tensor * get_v_idxs() const { return self_v_idxs; }

@@ -388,7 +388,7 @@ class llm_graph_result_i {

     virtual void set_inputs(const llama_ubatch * ubatch) = 0;

-    virtual bool update(const llm_graph_params & params) = 0;
+    virtual bool can_reuse(const llm_graph_params & params) = 0;
 };

 using llm_graph_result_ptr = std::unique_ptr<llm_graph_result_i>;

@@ -482,20 +482,20 @@ class llm_graph_result : public llm_graph_result_i {
         }
     }

-    // try to update the existing graph result using the new graph parameters
+    // try to update the existing graph result using the new graph parameters in order to reuse it
     // this can only be done if we determine that the resulting graph using the new graph parameters
     // would be identical to the existing graph. in that case, we simply have to update the memory
     // contexts of the input tensors of the graph and we can reuse it for another computation
     // return true if the graph was updated and can be reused
-    bool update(const llm_graph_params & params) override {
+    bool can_reuse(const llm_graph_params & params) override {
         if (!this->params.is_same(params)) {
             return false;
         }

         bool res = true;

         for (auto & input : inputs) {
-            res &= input->update(params);
+            res &= input->can_reuse(params);
         }

         return res;
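Taken together: llm_graph_result::can_reuse() first requires identical graph-level parameters via params.is_same(), then ANDs the per-input checks, and the base-class default of false keeps any input type without an implemented check conservative. A self-contained toy of that aggregation (toy names, not the real API):

#include <cstdio>
#include <memory>
#include <vector>

struct gparams {
    int n_tokens;

    bool is_same(const gparams & other) const {
        return n_tokens == other.n_tokens;
    }
};

struct input_i {
    // like llm_graph_input_i: a false default blocks reuse for any input
    // type whose check has not been implemented yet
    virtual bool can_reuse(const gparams &) { return false; }
    virtual ~input_i() = default;
};

struct input_with_check : input_i {
    int n_tokens;

    explicit input_with_check(int n) : n_tokens(n) {}

    bool can_reuse(const gparams & p) override {
        return p.n_tokens == n_tokens;
    }
};

struct graph_result {
    gparams params;
    std::vector<std::unique_ptr<input_i>> inputs;

    bool can_reuse(const gparams & p) {
        if (!params.is_same(p)) {
            return false; // graph-level parameters must match first
        }

        bool res = true;

        for (auto & input : inputs) {
            res &= input->can_reuse(p); // every input must agree
        }

        return res;
    }
};

int main() {
    graph_result res;
    res.params = {512};
    res.inputs.push_back(std::make_unique<input_with_check>(512));
    printf("%d\n", res.can_reuse({512})); // 1: params match, all inputs agree

    res.inputs.push_back(std::make_unique<input_i>()); // no check implemented
    printf("%d\n", res.can_reuse({512})); // 0: the conservative default wins
    return 0;
}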
