@@ -9921,7 +9921,7 @@ struct llm_build_mamba : public llm_graph_context {
                 cur = build_mamba_layer(rs_inp, gf, cur, model, ubatch, il);
             }
 
-            if (il == n_layer - 1) {
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
             }
@@ -13785,6 +13785,8 @@ struct llm_build_granite_hybrid : public llm_graph_context {
 
         auto * inp = build_inp_mem_hybrid();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         // Positional embeddings populated if rope enabled
         ggml_tensor * inp_pos = nullptr;
         if (use_rope) {
@@ -13810,9 +13812,7 @@ struct llm_build_granite_hybrid : public llm_graph_context {
                     n_embd_head, use_rope, il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
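Both hunks apply the same pattern: build_inp_out_ids() is called once, up front, when the graph inputs are created, instead of inside the last-layer branch, and the ggml_get_rows gather is guarded on inp_out_ids being non-null so it is skipped when no output rows were requested. The standalone C++ sketch below only illustrates that control flow under stated assumptions; Tensor and get_rows are hypothetical stand-ins for this example, not the real ggml/llama.cpp API.

// Minimal sketch of the pattern: build the output-row selector once before the
// layer loop, and gather rows on the last layer only when a selector exists.
#include <cstdio>
#include <vector>

// Hypothetical stand-in for ggml_tensor: one value per "token" keeps it small.
struct Tensor {
    std::vector<float> rows;
};

// Hypothetical stand-in for ggml_get_rows: gather the selected rows of t.
static Tensor get_rows(const Tensor & t, const std::vector<int> & ids) {
    Tensor out;
    for (int id : ids) {
        out.rows.push_back(t.rows[id]);
    }
    return out;
}

int main() {
    const int n_layer  = 4;
    const int n_tokens = 8;

    // Built once per graph; empty plays the role of a null inp_out_ids,
    // i.e. "no output-row selection requested".
    std::vector<int> inp_out_ids = {7};  // e.g. only the last token's output is needed

    Tensor cur;
    cur.rows.assign(n_tokens, 0.0f);

    for (int il = 0; il < n_layer; ++il) {
        // ... per-layer computation would go here ...
        for (float & v : cur.rows) {
            v += 1.0f;
        }

        // Skip computing output for unused tokens, but only when a selector was provided.
        if (il == n_layer - 1 && !inp_out_ids.empty()) {
            cur = get_rows(cur, inp_out_ids);
        }
    }

    printf("output rows: %zu\n", cur.rows.size());  // prints 1
    return 0;
}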