
Commit bd44d6a

ngxson authored and Minh141120 committed

model : gemma3n text-only (ggml-org#14400)

* gemma3n

* add llm_graph_input_one

1 parent 5deba9e · commit bd44d6a

File tree

convert_hf_to_gguf.py
src/llama-graph.cpp
src/llama-graph.h

3 files changed: +27 -7 lines changed


convert_hf_to_gguf.py

Lines changed: 5 additions & 5 deletions
@@ -936,11 +936,7 @@ def _create_vocab_sentencepiece(self):
         scores: list[float] = [-10000.0] * vocab_size
         toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
 
-        for token_id in range(tokenizer.vocab_size()):
-            if token_id >= vocab_size:
-                logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}')
-                break
-
+        for token_id in range(vocab_size):
             piece = tokenizer.IdToPiece(token_id)
             text = piece.encode("utf-8")
             score = tokenizer.GetScore(token_id)
@@ -955,6 +951,10 @@ def _create_vocab_sentencepiece(self):
             elif tokenizer.IsByte(token_id):
                 toktype = SentencePieceTokenTypes.BYTE
 
+            if token_id >= vocab_size:
+                logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}')
+                break
+
             tokens[token_id] = text
             scores[token_id] = score
             toktypes[token_id] = toktype
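Note on this hunk pair: the out-of-range guard now runs after the token's type has been classified instead of at the top of the loop, and the loop bound changes from tokenizer.vocab_size() to the model-side vocab_size. Any slot the loop leaves untouched keeps the placeholder score of -10000.0 and the UNUSED type that the arrays are pre-initialized with, as shown in the context lines above.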

src/llama-graph.cpp

Lines changed: 11 additions & 2 deletions
@@ -354,6 +354,12 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
     }
 }
 
+void llm_graph_input_one::set_input(const llama_ubatch *) {
+    GGML_ASSERT(one && ggml_nelements(one) == 1);
+    float f_one = 1.0f;
+    ggml_backend_tensor_set(one, &f_one, 0, sizeof(float));
+}
+
 //
 // llm_graph_context
 //
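set_input() is invoked once backend buffers have been allocated, so writing the constant with ggml_backend_tensor_set here is how a graph input gets materialized. For readers unfamiliar with the pattern, a minimal sketch of how such an input is typically created at graph-build time; build_inp_one is a hypothetical helper named after the existing build_inp_* family, not something this commit adds:

// Hypothetical helper in the style of llm_graph_context::build_inp_*;
// illustrates the input lifecycle only, it is not part of this commit.
ggml_tensor * llm_graph_context::build_inp_one() {
    auto inp = std::make_unique<llm_graph_input_one>();

    // allocate a single-element F32 tensor and mark it as a graph input,
    // so the allocator assigns it a backend buffer before set_input() runs
    inp->one = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
    ggml_set_input(inp->one);

    ggml_tensor * cur = inp->one;
    res->add_input(std::move(inp));
    return cur;
}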
@@ -1299,9 +1305,12 @@ ggml_tensor * llm_graph_context::build_attn(
 
     const auto * mctx_cur = is_swa ? mctx_iswa->get_swa() : mctx_iswa->get_base();
 
-    // store to KV cache
-    {
+    // optionally store to KV cache
+    if (k_cur) {
         ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, il));
+    }
+
+    if (v_cur) {
         ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, v_cur, il));
     }
 
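The build_attn change makes the two KV-cache writes independently optional: a null k_cur or v_cur now means "don't store", so a layer can attend over entries an earlier layer already placed in the cache, presumably what gemma3n's shared-KV layers rely on. A call-site sketch with a deliberately simplified, hypothetical signature (the real build_attn takes more arguments):

// Hypothetical, simplified call-site pattern; not the real signature.
// A layer that owns its KV stores fresh tensors; a layer that reuses
// KV stored by an earlier layer passes nullptr for both.
const bool reuse_kv = layer_shares_kv(il);   // hypothetical predicate
cur = build_attn(inp_attn, gf, wo, wo_b, q_cur,
                 reuse_kv ? nullptr : k_cur,
                 reuse_kv ? nullptr : v_cur,
                 il);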

src/llama-graph.h

Lines changed: 11 additions & 0 deletions
@@ -330,6 +330,17 @@ class llm_graph_input_mem_hybrid : public llm_graph_input_i {
     const llama_memory_hybrid_context * mctx;
 };
 
+// TODO: remove this when ggml_scale_add is implemented
+class llm_graph_input_one : public llm_graph_input_i {
+public:
+    llm_graph_input_one() {}
+    virtual ~llm_graph_input_one() = default;
+
+    void set_input(const llama_ubatch *) override;
+
+    ggml_tensor * one = nullptr; // F32
+};
+
 //
 // llm_graph_result
 //
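The TODO points at the class's purpose: ggml currently lacks a fused op that scales a tensor and then adds a constant, so the graph instead carries a literal 1.0f as a broadcastable input. A sketch of the idiom this enables, where x is a stand-in tensor and s a stand-in scale factor, neither taken from this commit:

// Emulate the missing ggml_scale_add(x, s, 1.0f), i.e. x*s + 1, with
// existing ops. `one` is the single-element F32 input owned by
// llm_graph_input_one; ggml_add broadcasts it across `x`.
ggml_tensor * scaled = ggml_scale(ctx0, x, s);        // x * s
ggml_tensor * result = ggml_add(ctx0, scaled, one);   // + 1.0f (broadcast)

Once a fused ggml_scale_add exists, the extra input tensor and this helper class can be removed, which is exactly what the TODO anticipates.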
