From 069f9ef4663f83948f25ec220237c5244668fd38 Mon Sep 17 00:00:00 2001
From: Ali Elmorsy
Date: Thu, 17 Jul 2025 01:44:53 +0300
Subject: [PATCH 1/2] mtmd : Support jinja in libmtmd (Only for QwenVL and Qwen Omni)

---
 common/chat.cpp         |  3 +-
 common/chat.h           |  3 ++
 include/llama.h         |  3 ++
 src/llama-arch.cpp      |  2 ++
 src/llama-arch.h        |  3 ++
 src/llama-vocab.cpp     | 39 ++++++++++++++++++++++++
 src/llama-vocab.h       |  2 ++
 tools/mtmd/clip.cpp     |  2 +-
 tools/mtmd/mtmd-cli.cpp | 44 ++++++++++++++++-----------
 tools/mtmd/mtmd.cpp     | 66 ++++++++++++++++++++++++++++++++++++++---
 10 files changed, 144 insertions(+), 23 deletions(-)

diff --git a/common/chat.cpp b/common/chat.cpp
index 114dbfccdbfe7..a21d09bb1da3a 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1727,7 +1727,8 @@ static common_chat_params common_chat_templates_apply_jinja(
         : *tmpls->template_default;
     const auto & src = tmpl.source();
     const auto & caps = tmpl.original_caps();
-    params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
+    bool concat_text = !inputs.no_part_concat && !tmpl.original_caps().requires_typed_content;
+    params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, concat_text);
     params.add_generation_prompt = inputs.add_generation_prompt;
     params.tool_choice = inputs.tool_choice;
     params.enable_thinking = inputs.enable_thinking;
diff --git a/common/chat.h b/common/chat.h
index ca807c145ee82..dc8510da20b7d 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -127,6 +127,9 @@ struct common_chat_templates_inputs {
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
    std::map<std::string, std::string> chat_template_kwargs;
+
+    // If true, the Jinja renderer won't concatenate content parts into a single part. That's useful for media parts.
+    bool no_part_concat = false;
 };

 struct common_chat_params {
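
A minimal caller-side sketch (not part of the patch) of what the new flag enables. It assumes an already-initialized common_chat_templates pointer; the function name render_with_media and the "image_url" part type are illustrative only:

    #include "chat.h"  // common/chat.h

    // Build a two-part user message and render it without part concatenation,
    // so each typed part survives templating as a separate entry.
    common_chat_params render_with_media(const common_chat_templates * tmpls) {
        common_chat_msg msg;
        msg.role = "user";
        msg.content_parts.push_back({ "text",      "Describe this image:" });
        msg.content_parts.push_back({ "image_url", "(media placeholder)"  }); // hypothetical media part

        common_chat_templates_inputs inputs;
        inputs.messages.push_back(msg);
        inputs.add_generation_prompt = true;
        inputs.no_part_concat        = true; // keep the parts typed instead of merging them

        return common_chat_templates_apply(tmpls, inputs);
    }

With no_part_concat left at its default (false) and a template that doesn't require typed content, the two parts would be concatenated into one text part before templating, exactly as before this patch.
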
diff --git a/include/llama.h b/include/llama.h
index dc86aea41dcbd..24a8552f5ae2e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1057,6 +1057,9 @@ extern "C" {
     LLAMA_API llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab);
     LLAMA_API llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab);

+    LLAMA_API llama_token llama_vocab_image_token(const struct llama_vocab * vocab);
+    LLAMA_API llama_token llama_vocab_audio_token(const struct llama_vocab * vocab);
+
     DEPRECATED(LLAMA_API const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_text instead");
     DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead");
     DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead");
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 9af9c2ad604d5..ebb4a0e8941e5 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -217,6 +217,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" },
     { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
     { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
+    { LLM_KV_TOKENIZER_IMAGE_ID,   "tokenizer.ggml.image_token_id"   },
+    { LLM_KV_TOKENIZER_AUDIO_ID,   "tokenizer.ggml.audio_token_id"   },

     { LLM_KV_ADAPTER_TYPE,       "adapter.type"       },
     { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index ba5d03fa24ebe..3ee3b2f1cdd75 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -225,6 +225,9 @@ enum llm_kv {
     LLM_KV_CLASSIFIER_OUTPUT_LABELS,

+    LLM_KV_TOKENIZER_IMAGE_ID,
+    LLM_KV_TOKENIZER_AUDIO_ID,
+
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,
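
A consumer-side sketch (not part of the patch): the two new accessors mirror the existing FIM token getters and return LLAMA_TOKEN_NULL when a model declares no such token. The helper name print_media_tokens is illustrative; llama_model_get_vocab and llama_vocab_get_text are existing llama.h API:

    #include "llama.h"
    #include <cstdio>

    // Probe a loaded model for its media placeholder tokens.
    void print_media_tokens(const llama_model * model) {
        const llama_vocab * vocab = llama_model_get_vocab(model);

        llama_token img_tok = llama_vocab_image_token(vocab);
        llama_token aud_tok = llama_vocab_audio_token(vocab);

        if (img_tok != LLAMA_TOKEN_NULL) {
            printf("image placeholder: %s\n", llama_vocab_get_text(vocab, img_tok));
        }
        if (aud_tok != LLAMA_TOKEN_NULL) {
            printf("audio placeholder: %s\n", llama_vocab_get_text(vocab, aud_tok));
        }
        // LLAMA_TOKEN_NULL here means the checkpoint declares no such
        // placeholder (e.g. a text-only or vision-only model).
    }
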
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 551bba171c0e0..2eb32150b6812 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1265,6 +1265,8 @@ struct llama_vocab::impl {
     llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
     llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
     llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
+    llama_token special_image_id   = LLAMA_TOKEN_NULL;
+    llama_token special_audio_id   = LLAMA_TOKEN_NULL;

     // tokenizer flags
     bool add_space_prefix = false;
@@ -1695,6 +1697,14 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
     }

+    const int image_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_IMAGE_ID).c_str());
+    if (image_idx != -1) {
+        special_image_id = gguf_get_val_u32(ctx, image_idx);
+    }
+    const int audio_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_AUDIO_ID).c_str());
+    if (audio_idx != -1) {
+        special_audio_id = gguf_get_val_u32(ctx, audio_idx);
+    }
     const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
     if (token_idx == -1) {
         throw std::runtime_error("cannot find tokenizer vocab in model file\n");
@@ -1730,6 +1740,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         token_data.score = scores ? scores[i] : 0.0f;
         token_data.attr  = LLAMA_TOKEN_ATTR_NORMAL;

+        if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
             switch(toktypes[i]) {
                 case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
@@ -1790,6 +1801,8 @@
             { LLM_KV_TOKENIZER_FIM_PAD_ID, special_fim_pad_id },
             { LLM_KV_TOKENIZER_FIM_REP_ID, special_fim_rep_id },
             { LLM_KV_TOKENIZER_FIM_SEP_ID, special_fim_sep_id },
+            { LLM_KV_TOKENIZER_IMAGE_ID,   special_image_id   },
+            { LLM_KV_TOKENIZER_AUDIO_ID,   special_audio_id   },

             // deprecated
             { LLM_KV_TOKENIZER_PREFIX_ID, special_fim_pre_id },
@@ -1867,6 +1880,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 }
             }
         }
+        if (special_image_id == LLAMA_TOKEN_NULL) {
+            if (t.first == "<|IMAGE|>" || t.first == "<image>") {
+                special_image_id = t.second;
+            }
+        }
+        if (special_audio_id == LLAMA_TOKEN_NULL) {
+            if (t.first == "<|AUDIO|>" || t.first == "