From db6f775c93848a56ccee8d53b0eaaaa76e53e409 Mon Sep 17 00:00:00 2001
From: HanishKVC
Date: Sat, 20 Apr 2024 11:44:15 +0530
Subject: [PATCH 1/7] Common:ChatOn: Add arguments for chaton

user needs to pass --chaton TEMPLATE_ID

TEMPLATE_ID will be one of the predefined chat templates already in
llama.cpp's llama_chat_apply_template_internal and related, like
chatml, llama2, llama3, ...
---
 common/common.cpp | 11 +++++++++++
 common/common.h   |  2 ++
 2 files changed, 13 insertions(+)

diff --git a/common/common.cpp b/common/common.cpp
index cf69535e2d1f5..b704d6f1f986a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -868,6 +868,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.chatml = true;
         return true;
     }
+    if (arg == "--chaton") {
+        params.chaton = true;
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.chaton_template_id = argv[i];
+        return true;
+    }
     if (arg == "--infill") {
         params.infill = true;
         return true;
@@ -1378,6 +1387,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --version             show version and build info\n");
     printf("  -i, --interactive     run in interactive mode\n");
     printf("  --interactive-first   run in interactive mode and wait for input right away\n");
+    printf("  --chaton TEMPLATE_ID  allow the interactive mode to apply the specified chat template before sending user input to model (you need to specify -i also)\n");
+    printf("                        TEMPLATE_ID could be chatml, llama3, ...\n");
     printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
     printf("  -cml, --chatml        run in chatml mode (use with ChatML-compatible models)\n");
     printf("  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
diff --git a/common/common.h b/common/common.h
index cca44268e6df5..931317c832153 100644
--- a/common/common.h
+++ b/common/common.h
@@ -139,6 +139,8 @@ struct gpt_params {
     bool use_color         = false; // use color to distinguish generations and inputs
     bool interactive       = false; // interactive mode
     bool chatml            = false; // chatml mode (used for models trained on chatml syntax)
+    bool chaton            = false; // chaton mode (used to chat with models which have been trained for chat and or instruct operation)
+    std::string chaton_template_id = ""; // the internal chat template to use
     bool prompt_cache_all  = false; // save user input and generations to prompt cache
     bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it

From efbcdc1cafa315782b4bb5330fa3672b319b4ad3 Mon Sep 17 00:00:00 2001
From: HanishKVC
Date: Sat, 20 Apr 2024 11:58:15 +0530
Subject: [PATCH 2/7] Common:ChatOn: ReversePrompts, SingleMsgChatTemplate wrapper

Helper to return reverse prompts needed for a given chat template.

A wrapper that will allow wrapping a given message within a tagged chat
template based on the role and chat template specified.
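
As a rough illustration (illustrative only, not part of this patch), with
the chatml template id the wrapper is expected to behave along these lines,
with the exact tagging coming from llama_chat_apply_template:

    // tag a single user message; add_ass=true asks for the assistant prefix
    std::string tagged = llama_chat_apply_template_simple("chatml", "user", "Hello there", true);
    // tagged is then expected to look roughly like:
    //   <|im_start|>user
    //   Hello there<|im_end|>
    //   <|im_start|>assistant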
---
 common/CMakeLists.txt |  1 +
 common/chaton.hpp     | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)
 create mode 100644 common/chaton.hpp

diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 0ec8d6d8d03b5..fe865abab708d 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -65,6 +65,7 @@ add_library(${TARGET} STATIC
     train.cpp
     ngram-cache.h
     ngram-cache.cpp
+    chaton.hpp
     )

 if (BUILD_SHARED_LIBS)
diff --git a/common/chaton.hpp b/common/chaton.hpp
new file mode 100644
index 0000000000000..62e82d658e7e7
--- /dev/null
+++ b/common/chaton.hpp
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "llama.h"
+#include "log.h"
+
+inline std::string llama_chat_apply_template_simple(
+        const std::string & tmpl,
+        const std::string &role,
+        const std::string &content,
+        bool add_ass) {
+    llama_chat_message msg = { role.c_str(), content.c_str() };
+    std::vector<llama_chat_message> msgs{ msg };
+    std::vector<char> buf(content.size() * 2);
+
+    int32_t slen = llama_chat_apply_template(nullptr, tmpl.c_str(), msgs.data(), msgs.size(), add_ass, buf.data(), buf.size());
+    if ((size_t) slen > buf.size()) {
+        buf.resize(slen);
+        slen = llama_chat_apply_template(nullptr, tmpl.c_str(), msgs.data(), msgs.size(), add_ass, buf.data(), buf.size());
+    }
+
+    const std::string tagged_msg(buf.data(), slen);
+    LOGLN("INFO:%s:%s", __func__, tagged_msg.c_str());
+    return tagged_msg;
+}
+
+// return what should be the reverse prompt for the given template id
+// ie possible end text tag(s) of specified model type's chat query response
+std::vector<std::string> llama_chat_reverse_prompt(std::string &template_id) {
+    std::vector<std::string> rends;
+
+    if (template_id == "chatml") {
+        rends.push_back("<|im_start|>user\n");
+    } else if (template_id == "llama3") {
+        rends.push_back("<|eot_id|>");
+    }
+    return rends;
+}

From 0a8797b28eacb3916f0b9be137d62c9725b66918 Mon Sep 17 00:00:00 2001
From: HanishKVC
Date: Sat, 20 Apr 2024 18:40:55 +0530
Subject: [PATCH 3/7] Main:Update to support chaton mode

Glanced through the existing interactive and chatml flow, to incorporate
this flow. Need to look deeper later.

NOTE: Up to this point this is a reapplication of my initial go at
chaton, while simplifying the amount of change done to the existing code
a bit more.
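
For example, the intended invocation (model path and prompt are
illustrative) is along the lines of:

    ./main -m models/model.gguf -i --chaton chatml -p "you are a helpful assistant"

i.e. --chaton is meant to be used together with -i, with the TEMPLATE_ID
matching the model being loaded.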
---
 examples/main/main.cpp | 52 +++++++++++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 13 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 249fc2bb605b3..a073a7bfdc3ad 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -1,4 +1,5 @@
 #include "common.h"
+#include "chaton.hpp"
 #include "console.h"
 #include "llama.h"

@@ -251,11 +252,14 @@ int main(int argc, char ** argv) {

     std::vector<llama_token> embd_inp;

-    if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
-        LOG("tokenize the prompt\n");
+    if (params.interactive_first || params.instruct || params.chatml || params.chaton || !params.prompt.empty() || session_tokens.empty()) {
+        LOG("tokenize the prompt: %s\n", params.prompt.c_str());
         if (params.chatml) {
             params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
         }
+        if (params.chaton) {
+            params.prompt = llama_chat_apply_template_simple(params.chaton_template_id, "system", params.prompt, false);
+        }
         embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
     } else {
         LOG("use session tokens\n");
@@ -333,7 +337,7 @@ int main(int argc, char ** argv) {
     }

     // number of tokens to keep when resetting context
-    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
+    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml || params.chaton) {
         params.n_keep = (int)embd_inp.size();
     } else {
         params.n_keep += add_bos; // always keep the BOS token
@@ -363,6 +367,19 @@ int main(int argc, char ** argv) {
         params.interactive_first = true;
         params.antiprompt.emplace_back("<|im_start|>user\n");
     }
+    // handle chaton mode, it adds on to any reverse prompt specified explicitly by the user
+    if (params.chaton) {
+        params.interactive_first = true;
+        std::vector<std::string> resp_ends = llama_chat_reverse_prompt(params.chaton_template_id);
+        if (resp_ends.size() == 0) {
+            LOG_TEELN("ERRR:%s:ChatOn:Unsupported ChatType:%s", __func__, params.chaton_template_id.c_str());
+            exit(1);
+        }
+        for (size_t i = 0; i < resp_ends.size(); i++)
+        {
+            params.antiprompt.emplace_back(resp_ends[i]);
+        }
+    }

     // enable interactive mode if interactive start is specified
     if (params.interactive_first) {
@@ -817,7 +834,7 @@ int main(int argc, char ** argv) {
         if (n_past > 0 && is_interacting) {
             LOG("waiting for user input\n");

-            if (params.instruct || params.chatml) {
+            if (params.instruct || params.chatml || params.chaton) {
                 printf("\n> ");
             }

@@ -876,15 +893,23 @@ int main(int argc, char ** argv) {
                     process_escapes(buffer);
                 }

-                const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
-                const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
-
-                LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
-
-                embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
-                embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
-                embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
+                std::vector<llama_token> line_inp;
+                if (params.chaton) {
+                    std::string f_chat = llama_chat_apply_template_simple(params.chaton_template_id, "user", buffer.c_str(), true);
+                    line_inp = ::llama_tokenize(ctx, f_chat, false, true);
+                    LOG("formatted input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
+                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
+                } else {
+                    const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
+                    line_inp = ::llama_tokenize(ctx, buffer, false, false);
+                    const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
+
+                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
+
+                    embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
+                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
+                    embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
+                }

                 // instruct mode: insert response suffix
                 if (params.instruct) {
@@ -921,6 +946,7 @@ int main(int argc, char ** argv) {
         }

         // end of text token
+        // chaton expected to be used along with interactive argument, so not checking for chaton separately
         if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
             LOG_TEE(" [end of text]\n");
             break;

From aac2ee6e9dbc0c14e99fbf7e3b86f88f5ecc561f Mon Sep 17 00:00:00 2001
From: HanishKVC
Date: Sat, 20 Apr 2024 20:08:00 +0530
Subject: [PATCH 4/7] Common:ChatOn+Main:DBUG: Cleanup ChatTmplSimp, RevPrompt Llama2

This is a commit with dbug messages.

ChatApplyTemplateSimple

* wasn't handling unknown template ids properly; this is identified now
  and a warning logged, rather than trying to work with a length of -1.
  Need to change this to quit later.

* Also avoid wrapping in a vector, as only a single message can be
  tagged wrt the chat handshake template.

ReversePrompt

Add support for llama2
---
 common/chaton.hpp      | 18 +++++++++++++-----
 examples/main/main.cpp |  4 +++-
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/common/chaton.hpp b/common/chaton.hpp
index 62e82d658e7e7..da2834668554a 100644
--- a/common/chaton.hpp
+++ b/common/chaton.hpp
@@ -7,18 +7,24 @@
 #include "log.h"

 inline std::string llama_chat_apply_template_simple(
-        const std::string & tmpl,
+        const std::string &tmpl,
         const std::string &role,
         const std::string &content,
         bool add_ass) {
     llama_chat_message msg = { role.c_str(), content.c_str() };
-    std::vector<llama_chat_message> msgs{ msg };
+    //std::vector<llama_chat_message> msgs{ msg };
     std::vector<char> buf(content.size() * 2);

-    int32_t slen = llama_chat_apply_template(nullptr, tmpl.c_str(), msgs.data(), msgs.size(), add_ass, buf.data(), buf.size());
+    int32_t slen = llama_chat_apply_template(nullptr, tmpl.c_str(), &msg, 1, add_ass, buf.data(), buf.size());
+    LOG_TEELN("DBUG:%s:AA:%s:LengthNeeded:%d:BufSizeWas:%zu", __func__, role.c_str(), slen, buf.size());
+    if (slen == -1) {
+        LOG_TEELN("WARN:%s:Unknown template [%s] encountered", __func__, tmpl.c_str());
+        return "";
+    }
     if ((size_t) slen > buf.size()) {
         buf.resize(slen);
-        slen = llama_chat_apply_template(nullptr, tmpl.c_str(), msgs.data(), msgs.size(), add_ass, buf.data(), buf.size());
+        slen = llama_chat_apply_template(nullptr, tmpl.c_str(), &msg, 1, add_ass, buf.data(), buf.size());
+        LOG_TEELN("DBUG:%s:BB:%s:LengthNeeded:%d:BufSizeWas:%zu", __func__, role.c_str(), slen, buf.size());
     }

     const std::string tagged_msg(buf.data(), slen);
@@ -28,11 +34,13 @@ inline std::string llama_chat_apply_template_simple(

 // return what should be the reverse prompt for the given template id
 // ie possible end text tag(s) of specified model type's chat query response
-std::vector<std::string> llama_chat_reverse_prompt(std::string &template_id) {
+inline std::vector<std::string> llama_chat_reverse_prompt(std::string &template_id) {
     std::vector<std::string> rends;

     if (template_id == "chatml") {
         rends.push_back("<|im_start|>user\n");
+    } else if (template_id == "llama2") {
+        rends.push_back("</s>");
     } else if (template_id == "llama3") {
rends.push_back("<|eot_id|>"); } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index a073a7bfdc3ad..e04b26d36bf25 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -258,7 +258,9 @@ int main(int argc, char ** argv) { params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>"; } if (params.chaton) { + LOG_TEELN("DBUG:%s:AA:%s", __func__, params.prompt.c_str()); params.prompt = llama_chat_apply_template_simple(params.chaton_template_id, "system", params.prompt, false); + LOG_TEELN("DBUG:%s:BB:%s", __func__, params.prompt.c_str()); } embd_inp = ::llama_tokenize(ctx, params.prompt, true, true); } else { @@ -372,7 +374,7 @@ int main(int argc, char ** argv) { params.interactive_first = true; std::vector resp_ends = llama_chat_reverse_prompt(params.chaton_template_id); if (resp_ends.size() == 0) { - LOG_TEELN("ERRR:%s:ChatOn:Unsupported ChatType:%s", __func__, params.chaton_template_id.c_str()); + LOG_TEELN("ERRR:%s:ChatOn:Unsupported ChatTemplateType:%s", __func__, params.chaton_template_id.c_str()); exit(1); } for (size_t i = 0; i < resp_ends.size(); i++) From ca55da2b6fa676f9cd3c286e98063b9c5a2d9c41 Mon Sep 17 00:00:00 2001 From: HanishKVC Date: Sat, 20 Apr 2024 22:41:27 +0530 Subject: [PATCH 5/7] ChatOn+Main: ChatApplyTemplateSimple cleanup Cleanup the associated log messages. Dont overload the return for status as well as data. Now the data returned if any is kept independent of the status of the operation. On failure log a message and exit. --- common/chaton.hpp | 21 ++++++++++++--------- examples/main/main.cpp | 13 +++++++++---- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/common/chaton.hpp b/common/chaton.hpp index da2834668554a..91b3d480253d2 100644 --- a/common/chaton.hpp +++ b/common/chaton.hpp @@ -6,30 +6,33 @@ #include "llama.h" #include "log.h" -inline std::string llama_chat_apply_template_simple( +// Tag the passed message suitabley as expected by the specified chat handshake template +// and the role. If the specified template is not supported logic will return false. 
+inline bool llama_chat_apply_template_simple(
         const std::string &tmpl,
         const std::string &role,
         const std::string &content,
+        std::string &dst,
         bool add_ass) {
     llama_chat_message msg = { role.c_str(), content.c_str() };
-    //std::vector<llama_chat_message> msgs{ msg };
-    std::vector<char> buf(content.size() * 2);
+    std::vector<char> buf(content.size() * 2); // This may under-allocate for small messages and over-allocate for large messages

     int32_t slen = llama_chat_apply_template(nullptr, tmpl.c_str(), &msg, 1, add_ass, buf.data(), buf.size());
-    LOG_TEELN("DBUG:%s:AA:%s:LengthNeeded:%d:BufSizeWas:%zu", __func__, role.c_str(), slen, buf.size());
     if (slen == -1) {
-        LOG_TEELN("WARN:%s:Unknown template [%s] encountered", __func__, tmpl.c_str());
-        return "";
+        LOG_TEELN("WARN:%s:Unknown template [%s] requested", __func__, tmpl.c_str());
+        dst = "";
+        return false;
     }
     if ((size_t) slen > buf.size()) {
+        LOGLN("INFO:%s:%s:LengthNeeded:%d:BufSizeWas:%zu", __func__, role.c_str(), slen, buf.size());
         buf.resize(slen);
         slen = llama_chat_apply_template(nullptr, tmpl.c_str(), &msg, 1, add_ass, buf.data(), buf.size());
-        LOG_TEELN("DBUG:%s:BB:%s:LengthNeeded:%d:BufSizeWas:%zu", __func__, role.c_str(), slen, buf.size());
     }

     const std::string tagged_msg(buf.data(), slen);
-    LOGLN("INFO:%s:%s", __func__, tagged_msg.c_str());
-    return tagged_msg;
+    LOGLN("INFO:%s:%s:%s", __func__, role.c_str(), tagged_msg.c_str());
+    dst = tagged_msg;
+    return true;
 }

 // return what should be the reverse prompt for the given template id
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index e04b26d36bf25..dfd16670e0313 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -258,9 +258,10 @@ int main(int argc, char ** argv) {
             params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
         }
         if (params.chaton) {
-            LOG_TEELN("DBUG:%s:AA:%s", __func__, params.prompt.c_str());
-            params.prompt = llama_chat_apply_template_simple(params.chaton_template_id, "system", params.prompt, false);
-            LOG_TEELN("DBUG:%s:BB:%s", __func__, params.prompt.c_str());
+            if (!llama_chat_apply_template_simple(params.chaton_template_id, "system", params.prompt, params.prompt, false)) {
+                LOG_TEELN("ERRR:%s:Wrt:%s:%s:%s", __func__, params.chaton_template_id.c_str(), "system", params.prompt.c_str());
+                exit(2);
+            }
         }
         embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
     } else {
@@ -897,7 +898,11 @@ int main(int argc, char ** argv) {

                 std::vector<llama_token> line_inp;
                 if (params.chaton) {
-                    std::string f_chat = llama_chat_apply_template_simple(params.chaton_template_id, "user", buffer.c_str(), true);
+                    std::string f_chat;
+                    if (!llama_chat_apply_template_simple(params.chaton_template_id, "user", buffer.c_str(), f_chat, true)) {
+                        LOG_TEELN("ERRR:%s:Wrt:%s:%s:%s", __func__, params.chaton_template_id.c_str(), "user", params.prompt.c_str());
+                        exit(2);
+                    }
                     line_inp = ::llama_tokenize(ctx, f_chat, false, true);
                     LOG("formatted input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
                     embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

From e23b5c8689d5aea56e1f6ecad2eaee66e3aa8081 Mon Sep 17 00:00:00 2001
From: HanishKVC
Date: Sat, 20 Apr 2024 23:26:16 +0530
Subject: [PATCH 6/7] ChatOn+Main: Cleanup the Requested ChatOn ReversePrompt handling

Avoid the use of a separate vector, which in turn was copied into the
main vector on return. Now directly pass the main reverse prompt vector
and directly add to the passed vector.

Also keep the data and the return status separate.

Explicitly identify an unknown template_id situation and return a
failure status.
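
A sketch of the resulting calling convention (variable names here are
illustrative):

    std::vector<std::string> antiprompts;
    if (!llama_chat_reverse_prompt(template_id, antiprompts)) {
        // unknown template id, caller decides how to fail
    }
    // on success the matching reverse prompt(s) have been appended to antiprompts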
---
 common/chaton.hpp      | 18 ++++++++++--------
 examples/main/main.cpp |  7 +------
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/common/chaton.hpp b/common/chaton.hpp
index 91b3d480253d2..b33027669e018 100644
--- a/common/chaton.hpp
+++ b/common/chaton.hpp
@@ -36,16 +36,18 @@ inline bool llama_chat_apply_template_simple(
 }

 // return what should be the reverse prompt for the given template id
-// ie possible end text tag(s) of specified model type's chat query response
-inline std::vector<std::string> llama_chat_reverse_prompt(std::string &template_id) {
-    std::vector<std::string> rends;
-
+// ie possible end text tag(s) of specified model type's chat query response.
+// Note that it adds these reverse prompts to any that may already exist in the passed vector.
+inline bool llama_chat_reverse_prompt(std::string &template_id, std::vector<std::string> &rprompts) {
     if (template_id == "chatml") {
-        rends.push_back("<|im_start|>user\n");
+        rprompts.push_back("<|im_start|>user\n");
     } else if (template_id == "llama2") {
-        rends.push_back("</s>");
+        rprompts.push_back("</s>");
     } else if (template_id == "llama3") {
-        rends.push_back("<|eot_id|>");
+        rprompts.push_back("<|eot_id|>");
+    } else {
+        LOG_TEELN("WARN:%s:Unknown template [%s] requested", __func__, template_id.c_str());
+        return false;
     }
-    return rends;
+    return true;
 }
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index dfd16670e0313..32bcee9c43199 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -373,15 +373,10 @@ int main(int argc, char ** argv) {
     // handle chaton mode, it adds on to any reverse prompt specified explicitly by the user
     if (params.chaton) {
         params.interactive_first = true;
-        std::vector<std::string> resp_ends = llama_chat_reverse_prompt(params.chaton_template_id);
-        if (resp_ends.size() == 0) {
+        if (!llama_chat_reverse_prompt(params.chaton_template_id, params.antiprompt)) {
            LOG_TEELN("ERRR:%s:ChatOn:Unsupported ChatTemplateType:%s", __func__, params.chaton_template_id.c_str());
             exit(1);
         }
-        for (size_t i = 0; i < resp_ends.size(); i++)
-        {
-            params.antiprompt.emplace_back(resp_ends[i]);
-        }
     }

     // enable interactive mode if interactive start is specified

From 9037892127cf01707cc92a4a242f79e3b9fbffa4 Mon Sep 17 00:00:00 2001
From: HanishKVC
Date: Sat, 20 Apr 2024 23:42:25 +0530
Subject: [PATCH 7/7] ChatON: Add a note

---
 common/chaton.hpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/common/chaton.hpp b/common/chaton.hpp
index b33027669e018..9616dea407c69 100644
--- a/common/chaton.hpp
+++ b/common/chaton.hpp
@@ -1,5 +1,21 @@
 #pragma once

+/**
+ *
+ * Provides simple and dumb helpers which help chat with llm chat/instruct models
+ * using the chat template expected by them.
+ *
+ * Normally used to tag system prompt and user messages.
+ * Currently used by example/main programs.
+ *
+ * This builds on llama_chat_apply_template. When adding support for new chat templates
+ * remember to update llama_chat_apply_template_internal as well as llama_chat_reverse_prompt.
+ *
+ * example/main program uses this when --chaton TEMPLATE_ID is passed to it along with -i
+ * sample TEMPLATE_ID's include chatml, llama2, llama3, ...
+ *
+ */
+
 #include <string>
 #include <vector>
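
Note: as a rough sketch (illustrative only, not part of the patches above),
at the end of this series the two helpers in chaton.hpp are used together
roughly as follows, with user_text standing in for the interactive input:

    std::string tagged;
    if (!llama_chat_apply_template_simple(params.chaton_template_id, "user", user_text, tagged, true)) {
        // unsupported template id
    }
    if (!llama_chat_reverse_prompt(params.chaton_template_id, params.antiprompt)) {
        // unsupported template id
    }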