@@ -371,31 +371,11 @@ class llm_graph_input_mem_hybrid : public llm_graph_input_i {
 // along with the input tensors, the object also provides commonly used output tensors, such as logits, embeddings, etc.
 // these are used by the llama_context to extract the relevant data, based on the compute parameters
 
-// TODO: this interface seems redundant - remove it
-class llm_graph_result_i {
-public:
-    virtual ~llm_graph_result_i() = default;
-
-    virtual ggml_tensor * get_tokens()      const = 0;
-    virtual ggml_tensor * get_logits()      const = 0;
-    virtual ggml_tensor * get_embd()        const = 0;
-    virtual ggml_tensor * get_embd_pooled() const = 0;
-
-    virtual ggml_cgraph  * get_gf()  = 0;
-    virtual ggml_context * get_ctx() = 0;
-
-    virtual void reset() = 0;
-
-    virtual void set_inputs(const llama_ubatch * ubatch) = 0;
-
-    virtual bool can_reuse(const llm_graph_params & params) = 0;
-};
-
-using llm_graph_result_ptr = std::unique_ptr<llm_graph_result_i>;
-
 // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
 using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
 
+class llm_graph_result;
+
 struct llm_graph_params {
     llm_arch arch = LLM_ARCH_UNKNOWN;
 
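For reference, `llm_graph_cb` is a plain `std::function`, so callers can pass a lambda with the signature shown above. A minimal sketch (the lambda body and its use of the ggml naming helpers are illustrative, not part of this patch; it assumes the ggml/llama headers are included):

```cpp
// Sketch: a callback matching the llm_graph_cb signature from the header above.
llm_graph_cb cb = [](const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il) {
    GGML_UNUSED(ubatch);
    if (il >= 0) {
        // tag the tensor with its layer index, e.g. "ffn_out-7"
        ggml_format_name(cur, "%s-%d", name, il);
    } else {
        ggml_set_name(cur, name);
    }
};
```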
@@ -418,8 +398,7 @@ struct llm_graph_params {
 
     llm_graph_cb cb;
 
-    // TODO: temporary
-    llm_graph_result_i * res;
+    llm_graph_result * res;
 
     // return true if the "other" params would result in a graph with the same topology as with the current params
     // having the same topology allows us to reuse the graph in some cases
@@ -462,27 +441,27 @@ struct llm_graph_params {
     }
 };
 
-class llm_graph_result : public llm_graph_result_i {
+class llm_graph_result {
 public:
     llm_graph_result(int64_t max_nodes) : max_nodes(max_nodes) {
         reset();
     }
 
     virtual ~llm_graph_result() = default;
 
-    ggml_tensor * get_tokens()      const override { return t_tokens; }
-    ggml_tensor * get_logits()      const override { return t_logits; }
-    ggml_tensor * get_embd()        const override { return t_embd; }
-    ggml_tensor * get_embd_pooled() const override { return t_embd_pooled; }
+    ggml_tensor * get_tokens()      const { return t_tokens; }
+    ggml_tensor * get_logits()      const { return t_logits; }
+    ggml_tensor * get_embd()        const { return t_embd; }
+    ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
 
-    ggml_cgraph  * get_gf()  override { return gf; }
-    ggml_context * get_ctx() override { return ctx_compute.get(); }
+    ggml_cgraph  * get_gf()  { return gf; }
+    ggml_context * get_ctx() { return ctx_compute.get(); }
 
     void set_max_nodes(int64_t max_nodes) {
         this->max_nodes = max_nodes;
     }
 
-    void reset() override {
+    void reset() {
         t_tokens = nullptr;
         t_logits = nullptr;
         t_embd   = nullptr;
@@ -503,7 +482,7 @@ class llm_graph_result : public llm_graph_result_i {
         gf = ggml_new_graph_custom(ctx_compute.get(), max_nodes, false);
     }
 
-    void set_inputs(const llama_ubatch * ubatch) override {
+    void set_inputs(const llama_ubatch * ubatch) {
         for (auto & input : inputs) {
             input->set_input(ubatch);
         }
@@ -514,7 +493,7 @@ class llm_graph_result : public llm_graph_result_i {
     // would be identical to the existing graph. in that case, we simply have to update the memory
     // contexts of the input tensors of the graph and we can reuse it for another computation
     // return true if the graph was updated and can be reused
-    bool can_reuse(const llm_graph_params & params) override {
+    bool can_reuse(const llm_graph_params & params) {
         if (!this->params.allow_reuse(params)) {
             return false;
         }
@@ -533,6 +512,10 @@ class llm_graph_result : public llm_graph_result_i {
         return inputs.back().get();
     }
 
+    void set_params(const llm_graph_params & params) {
+        this->params = params;
+    }
+
     // important graph nodes
     ggml_tensor * t_tokens = nullptr;
     ggml_tensor * t_logits = nullptr;
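Putting the new pieces together, the intended reuse flow looks roughly like this (driver-side sketch; the variables `res`, `params` and `ubatch` are assumed to already exist and are not part of the patch):

```cpp
// Sketch: how a caller such as llama_context is expected to drive graph reuse.
if (res->can_reuse(params)) {
    // same topology as the previous graph: just refill the input tensors
    res->set_inputs(&ubatch);
} else {
    // topology changed: rebuild the graph from scratch
    res->reset();
    // ... construct the new graph into res->get_ctx() / res->get_gf() ...
    res->set_params(params);   // remembered for the next can_reuse() check
    res->set_inputs(&ubatch);
}
// after computing res->get_gf(), outputs are read back via the accessors,
// e.g. res->get_logits() or res->get_embd()
```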
@@ -550,12 +533,15 @@ class llm_graph_result : public llm_graph_result_i {
 
     int64_t max_nodes;
 
+private:
     // keep a copy of the previous graph parameters
     // we will use this to determine whether the graph can be reused by comparing them with the new parameters
     // note: these are updated after constructing the new graph
     llm_graph_params params;
 };
 
+using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;
+
 //
 // llm_graph_context
 //
@@ -613,6 +599,7 @@ struct llm_graph_context {
     llm_graph_result * res;
 
     ggml_context * ctx0 = nullptr;
+    ggml_cgraph  * gf   = nullptr;
 
     llm_graph_context(const llm_graph_params & params);
     virtual ~llm_graph_context() = default;
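Because `gf` is now a member of `llm_graph_context`, the `ggml_cgraph * gf` parameter can be dropped from the builder methods in the hunks below. Roughly (the helper name and `cur` are illustrative, not from the patch):

```cpp
// Sketch of the effect inside a builder method:
ggml_tensor * llm_graph_context::build_example(ggml_tensor * cur) const {
    // before: ggml_build_forward_expand(gf, cur); with gf received as a parameter
    ggml_build_forward_expand(gf, cur); // now reads the llm_graph_context member added above
    return cur;
}
```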
@@ -698,7 +685,6 @@ struct llm_graph_context {
     //
 
     ggml_tensor * build_attn_mha(
-            ggml_cgraph * gf,
             ggml_tensor * q,       // [n_embd_head_q, n_head_q, n_tokens]
             ggml_tensor * k,       // [n_embd_head_k, n_head_k, n_tokens]
             ggml_tensor * v,       // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
@@ -711,7 +697,6 @@ struct llm_graph_context {
 
     ggml_tensor * build_attn(
             llm_graph_input_attn_no_cache * inp,
-            ggml_cgraph * gf,
             ggml_tensor * wo,
             ggml_tensor * wo_b,
             ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -726,7 +711,6 @@ struct llm_graph_context {
 
     ggml_tensor * build_attn(
             llm_graph_input_attn_kv_unified * inp,
-            ggml_cgraph * gf,
             ggml_tensor * wo,
             ggml_tensor * wo_b,
             ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -742,7 +726,6 @@ struct llm_graph_context {
     // note: if k_cur or v_cur are not provided, they will not be stored in the memory
     ggml_tensor * build_attn(
             llm_graph_input_attn_kv_unified_iswa * inp,
-            ggml_cgraph * gf,
             ggml_tensor * wo,
             ggml_tensor * wo_b,
             ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -757,7 +740,6 @@ struct llm_graph_context {
 
     ggml_tensor * build_attn(
             llm_graph_input_attn_cross * inp,
-            ggml_cgraph * gf,
             ggml_tensor * wo,
             ggml_tensor * wo_b,
             ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@@ -779,7 +761,6 @@ struct llm_graph_context {
     // implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
     // `llama_memory_recurrent`
     ggml_tensor * build_rs(
-            ggml_cgraph * gf,
             ggml_tensor * s,
             ggml_tensor * state_copy,
             int32_t state_size,
@@ -794,17 +775,15 @@ struct llm_graph_context {
 
     ggml_tensor * build_rs(
             llm_graph_input_rs * inp,
-            ggml_cgraph * gf,
             ggml_tensor * s,
             int32_t state_size,
             int32_t n_seqs,
             const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
 
     ggml_tensor * build_rwkv_token_shift_load(
             llm_graph_input_rs * inp,
-            ggml_cgraph * gf,
             const llama_ubatch & ubatch,
-                    int il) const;
+            int il) const;
 
     ggml_tensor * build_rwkv_token_shift_store(
             ggml_tensor * token_shift,
@@ -821,7 +800,6 @@ struct llm_graph_context {
     //
 
     void build_pooling(
-            ggml_cgraph * gf,
             ggml_tensor * cls,
             ggml_tensor * cls_b,
             ggml_tensor * cls_out,