diff --git a/common/chat.cpp b/common/chat.cpp
index 114dbfccdbfe7..a21d09bb1da3a 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1727,7 +1727,8 @@ static common_chat_params common_chat_templates_apply_jinja(
         : *tmpls->template_default;
     const auto & src = tmpl.source();
     const auto & caps = tmpl.original_caps();
-    params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
+    const bool concat_text = !inputs.no_part_concat && !caps.requires_typed_content;
+    params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, concat_text);
     params.add_generation_prompt = inputs.add_generation_prompt;
     params.tool_choice = inputs.tool_choice;
     params.enable_thinking = inputs.enable_thinking;
diff --git a/common/chat.h b/common/chat.h
index ca807c145ee82..dc8510da20b7d 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -127,6 +127,9 @@ struct common_chat_templates_inputs {
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
     std::map<std::string, std::string> chat_template_kwargs;
+
+    // If true, the jinja renderer won't concatenate content parts into a single part; useful for media parts
+    bool no_part_concat = false;
 };
 
 struct common_chat_params {
diff --git a/include/llama.h b/include/llama.h
index db6a5337b02a7..581b7048085c1 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1021,6 +1021,9 @@ extern "C" {
     LLAMA_API llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab);
     LLAMA_API llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab);
 
+    LLAMA_API llama_token llama_vocab_image_token(const struct llama_vocab * vocab);
+    LLAMA_API llama_token llama_vocab_audio_token(const struct llama_vocab * vocab);
+
     DEPRECATED(LLAMA_API const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_text instead");
     DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead");
     DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead");
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 9454d04e53801..6d42b3d84d521 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -224,6 +224,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_FIM_PAD_ID,           "tokenizer.ggml.fim_pad_token_id"   },
     { LLM_KV_TOKENIZER_FIM_REP_ID,           "tokenizer.ggml.fim_rep_token_id"   },
     { LLM_KV_TOKENIZER_FIM_SEP_ID,           "tokenizer.ggml.fim_sep_token_id"   },
+    { LLM_KV_TOKENIZER_IMAGE_ID,             "tokenizer.ggml.image_token_id"     },
+    { LLM_KV_TOKENIZER_AUDIO_ID,             "tokenizer.ggml.audio_token_id"     },
 
     { LLM_KV_ADAPTER_TYPE,       "adapter.type"       },
     { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 0ead0d6cdb11b..13284c1b40eb1 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -230,8 +230,13 @@ enum llm_kv {
     LLM_KV_CLASSIFIER_OUTPUT_LABELS,
 
+
+    LLM_KV_TOKENIZER_IMAGE_ID,
+    LLM_KV_TOKENIZER_AUDIO_ID,
+
     LLM_KV_SHORTCONV_L_CACHE,
+
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 2181c01e31a87..8db8a2782a971 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1551,6 +1551,8 @@ struct llama_vocab::impl {
     llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
     llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
    llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
+    llama_token special_image_id = LLAMA_TOKEN_NULL;
+    llama_token special_audio_id = LLAMA_TOKEN_NULL;
 
     // tokenizer flags
     bool add_space_prefix = false;
@@ -1999,6 +2001,14 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
     }
 
+    const int image_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_IMAGE_ID).c_str());
+    if (image_idx != -1) {
+        special_image_id = gguf_get_val_u32(ctx, image_idx);
+    }
+    const int audio_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_AUDIO_ID).c_str());
+    if (audio_idx != -1) {
+        special_audio_id = gguf_get_val_u32(ctx, audio_idx);
+    }
     const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
     if (token_idx == -1) {
         throw std::runtime_error("cannot find tokenizer vocab in model file\n");
@@ -2034,6 +2044,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             token_data.score = scores ? scores[i] : 0.0f;
             token_data.attr  = LLAMA_TOKEN_ATTR_NORMAL;
 
+
             if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
                 switch(toktypes[i]) {
                     case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
@@ -2094,6 +2105,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             { LLM_KV_TOKENIZER_FIM_PAD_ID, special_fim_pad_id },
             { LLM_KV_TOKENIZER_FIM_REP_ID, special_fim_rep_id },
             { LLM_KV_TOKENIZER_FIM_SEP_ID, special_fim_sep_id },
+            { LLM_KV_TOKENIZER_IMAGE_ID,   special_image_id   },
+            { LLM_KV_TOKENIZER_AUDIO_ID,   special_audio_id   },
 
             // deprecated
             { LLM_KV_TOKENIZER_PREFIX_ID, special_fim_pre_id },
@@ -2172,6 +2185,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 }
             }
         }
+        if (special_image_id == LLAMA_TOKEN_NULL) {
+            if (t.first == "<|IMAGE|>" || t.first == "<image>") {
+                special_image_id = t.second;
+            }
+        }
+        if (special_audio_id == LLAMA_TOKEN_NULL) {
+            if (t.first == "<|AUDIO|>" || t.first == "
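
Below is a minimal sketch (not part of the patch) of how a caller might consume the two new vocab accessors. It assumes a loaded `llama_model` and that, like the other `llama_vocab_*` getters, the new functions return `LLAMA_TOKEN_NULL` when the model defines no such token; `print_media_tokens` and `model` are illustrative names:

```cpp
#include "llama.h"

#include <cstdio>

// Illustrative only: report the multimodal marker tokens, if the model has any.
static void print_media_tokens(const llama_model * model) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    const llama_token img = llama_vocab_image_token(vocab);
    const llama_token aud = llama_vocab_audio_token(vocab);

    if (img != LLAMA_TOKEN_NULL) {
        printf("image token: %d (%s)\n", img, llama_vocab_get_text(vocab, img));
    } else {
        printf("model defines no image token\n");
    }
    if (aud != LLAMA_TOKEN_NULL) {
        printf("audio token: %d (%s)\n", aud, llama_vocab_get_text(vocab, aud));
    } else {
        printf("model defines no audio token\n");
    }
}
```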
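Similarly, a sketch of how the new `no_part_concat` flag might be used through the common chat API; `render_keep_parts` is a hypothetical helper, and note the flag only takes effect on the jinja path patched above:

```cpp
#include "chat.h"

#include <vector>

// Hypothetical helper: render a prompt while keeping typed content parts
// separate, so a media-aware caller can later substitute image/audio parts
// with the model's marker tokens instead of receiving one merged text part.
static common_chat_params render_keep_parts(
        const common_chat_templates        * tmpls,
        const std::vector<common_chat_msg> & msgs) {
    common_chat_templates_inputs inputs;
    inputs.messages              = msgs;
    inputs.add_generation_prompt = true;
    inputs.use_jinja             = true; // the flag only affects the jinja renderer
    inputs.no_part_concat        = true; // new flag: don't merge content parts

    return common_chat_templates_apply(tmpls, inputs);
}
```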