
Commit a2a7403

ggerganov authored and iThalay committed
talk-llama : sync llama.cpp
1 parent ab1e794 commit a2a7403

File tree: 7 files changed (+3331, -740 lines)


examples/talk-llama/llama.cpp

Lines changed: 3266 additions & 724 deletions
Large diffs are not rendered by default.

examples/talk-llama/llama.h

Lines changed: 52 additions & 1 deletion
@@ -67,6 +67,7 @@ extern "C" {
         LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
         LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
         LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
+        LLAMA_VOCAB_TYPE_UGM  = 4, // T5 tokenizer based on Unigram
     };
 
     // pre-tokenization types
@@ -87,6 +88,10 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_DBRX     = 13,
         LLAMA_VOCAB_PRE_TYPE_SMAUG    = 14,
         LLAMA_VOCAB_PRE_TYPE_PORO     = 15,
+        LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
+        LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
+        LLAMA_VOCAB_PRE_TYPE_VIKING   = 18,
+        LLAMA_VOCAB_PRE_TYPE_JAIS     = 19,
     };
 
     // note: these values should be synchronized with ggml_rope
@@ -177,6 +182,12 @@ extern "C" {
         LLAMA_POOLING_TYPE_LAST = 3,
     };
 
+    enum llama_attention_type {
+        LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
+        LLAMA_ATTENTION_TYPE_CAUSAL      = 0,
+        LLAMA_ATTENTION_TYPE_NON_CAUSAL  = 1,
+    };
+
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
@@ -294,6 +305,7 @@ extern "C" {
294305

295306
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
296307
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
308+
enum llama_attention_type attention_type; // attention type to use for embeddings
297309

298310
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
299311
float rope_freq_base; // RoPE base frequency, 0 = from model
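
A minimal sketch (not part of this commit) of how the new attention_type field could be used when building an embedding context; every other identifier shown already exists in llama.h:

    // sketch: request non-causal attention for an embedding context
    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings     = true;
    cparams.pooling_type   = LLAMA_POOLING_TYPE_MEAN;
    cparams.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; // field added by this sync
    llama_context * ctx = llama_new_context_with_model(model, cparams);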
@@ -482,6 +494,13 @@ extern "C" {
     // Get a llama model tensor
     LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
 
+    // Returns true if the model contains an encoder that requires llama_encode() call
+    LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
+
+    // For encoder-decoder models, this function returns id of the token that must be provided
+    // to the decoder to start generating output sequence. For other models, it returns -1.
+    LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
+
     // Returns 0 on success
     LLAMA_API uint32_t llama_model_quantize(
             const char * fname_inp,
@@ -767,6 +786,14 @@ extern "C" {
     // Frees a batch of tokens allocated with llama_batch_init()
     LLAMA_API void llama_batch_free(struct llama_batch batch);
 
+    // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
+    // Stores the encoder output internally for later use by the decoder cross-attention layers.
+    //   0 - success
+    // < 0 - error
+    LLAMA_API int32_t llama_encode(
+            struct llama_context * ctx,
+            struct llama_batch     batch);
+
     // Positive return values does not mean a fatal error, but rather a warning.
     //   0 - success
     //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
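
A hedged sketch of how the new encoder entry points fit together (assuming a loaded model, a context ctx, and a tokenized prompt; only llama_model_has_encoder, llama_encode, and llama_model_decoder_start_token come from this change, the rest is existing API):

    if (llama_model_has_encoder(model)) {
        // run the encoder once over the prompt; its output is cached for cross-attention
        llama_batch batch = llama_batch_get_one(prompt.data(), (int32_t) prompt.size(), 0, 0);
        if (llama_encode(ctx, batch) != 0) {
            // handle error
        }
        // the decoder must be primed with the model's decoder start token
        llama_token dec_start = llama_model_decoder_start_token(model);
        if (dec_start == -1) {
            dec_start = llama_token_bos(model); // fall back to BOS if none is defined
        }
        // feed dec_start to llama_decode() to begin generating the output sequence
    }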
@@ -857,6 +884,7 @@ extern "C" {
     LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
     LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
+    LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
 
     // Returns -1 if unknown, 1 for true or 0 for false.
     LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
@@ -878,6 +906,7 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
     /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
     ///                      as plaintext. Does not insert a leading space.
     LLAMA_API int32_t llama_tokenize(
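
A small sketch (not from this commit) of the usual two-pass call pattern for llama_tokenize(), showing where the newly documented add_special flag and the existing parse_special flag go; model and a std::string text are assumed:

    std::vector<llama_token> tokens(text.size() + 2);
    int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(),
                               /*add_special   =*/ true,   // add BOS/EOS if the model is configured to
                               /*parse_special =*/ false);
    if (n < 0) {                // buffer too small: -n is the required token count
        tokens.resize(-n);
        n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                           tokens.data(), (int32_t) tokens.size(), true, false);
    }
    tokens.resize(n);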
@@ -892,15 +921,31 @@ extern "C" {
     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
     // Does not write null terminator to the buffer.
-    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
     // @param special If true, special tokens are rendered in the output.
     LLAMA_API int32_t llama_token_to_piece(
             const struct llama_model * model,
             llama_token   token,
             char        * buf,
             int32_t       length,
+            int32_t       lstrip,
             bool          special);
 
+    /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
+    /// @param text The char pointer must be large enough to hold the resulting text.
+    /// @return Returns the number of chars/bytes on success, no more than text_len_max.
+    /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
+    /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
+    /// @param unparse_special If true, special tokens are rendered in the output.
+    LLAMA_API int32_t llama_detokenize(
+            const struct llama_model * model,
+            const llama_token        * tokens,
+            int32_t                    n_tokens,
+            char                     * text,
+            int32_t                    text_len_max,
+            bool                       remove_special,
+            bool                       unparse_special);
+
     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
     /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
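
For context (a sketch, not code from this commit), the new llama_detokenize() follows the same grow-and-retry convention as llama_tokenize(); model and a std::vector<llama_token> tokens are assumed:

    std::string text(tokens.size() * 4, '\0');   // rough initial guess
    int32_t n = llama_detokenize(model, tokens.data(), (int32_t) tokens.size(),
                                 &text[0], (int32_t) text.size(),
                                 /*remove_special  =*/ false,
                                 /*unparse_special =*/ false);
    if (n < 0) {                                 // buffer too small: -n is the required size
        text.resize(-n);
        n = llama_detokenize(model, tokens.data(), (int32_t) tokens.size(),
                             &text[0], (int32_t) text.size(), false, false);
    }
    text.resize(n);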
@@ -924,6 +969,12 @@ extern "C" {
     // Grammar
     //
 
+    /// Initialize a llama_grammar.
+    ///
+    /// @param rules The rule elements of the grammar to initialize.
+    /// @param n_rules The number of rules.
+    /// @param start_rule_index The index of the root rule (the starting point of the grammar).
+    /// @return The initialized llama_grammar or nullptr if initialization failed.
     LLAMA_API struct llama_grammar * llama_grammar_init(
             const llama_grammar_element ** rules,
             size_t n_rules,
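
To illustrate the newly documented parameters, here is a hedged, hand-built sketch of the smallest possible grammar (root ::= "hi"); in practice the rule arrays come from parsing a GBNF file, and llama_gretype / llama_grammar_element are existing types in llama.h:

    const llama_grammar_element rule_root[] = {
        { LLAMA_GRETYPE_CHAR, 'h' },
        { LLAMA_GRETYPE_CHAR, 'i' },
        { LLAMA_GRETYPE_END,  0   },
    };
    const llama_grammar_element * rules[] = { rule_root };
    struct llama_grammar * grammar = llama_grammar_init(rules, /*n_rules =*/ 1, /*start_rule_index =*/ 0);
    if (grammar == nullptr) {
        // initialization failed (nullptr is returned on error, per the new doc comment)
    }
    // ... use with llama_sample_grammar() / llama_grammar_accept_token(), then llama_grammar_free(grammar)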

examples/talk-llama/talk-llama.cpp

Lines changed: 2 additions & 2 deletions
@@ -35,10 +35,10 @@ static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const
 
 static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);

examples/talk-llama/unicode-data.cpp

Lines changed: 0 additions & 1 deletion
@@ -7030,4 +7030,3 @@ const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd
 {0x02FA1C, 0x02FA1C, 0x009F3B},
 {0x02FA1D, 0x02FA1D, 0x02A600},
 };
-

examples/talk-llama/unicode.cpp

Lines changed: 9 additions & 11 deletions
@@ -23,7 +23,7 @@ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
     return result;
 }
 
-static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
+uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
     assert(offset < utf8.size());
     if (!(utf8[offset + 0] & 0x80)) {
         auto result = utf8[offset + 0];
@@ -232,8 +232,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
     };
 
     auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
-        static const codepoint_flags undef(codepoint_flags::UNDEFINED);
-        return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
+        return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
     };
 
     size_t _prev_end = offset_ini;
@@ -295,9 +294,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
                 continue;
             }
             // regex: <space>?[^\s\p{L}\p{N}]+
-            if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
+            if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
                 pos += (cpt == ' ');
-                while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
+                while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
                     flags2 = _get_flags(++pos);
                 }
                 _add_token(pos);
@@ -351,8 +350,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
     };
 
     auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
-        static const codepoint_flags undef(codepoint_flags::UNDEFINED);
-        return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
+        return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
     };
 
     size_t _prev_end = offset_ini;
@@ -394,8 +392,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
                 }
             }
 
-            // regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
-            if (!(cpt == '\r' || cpt == '\n' || /*flags.is_letter |*/ flags.is_number)) {
+            // regex: [^\r\n\p{L}\p{N}]?\p{L}+
+            if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
                 if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
                     pos++;
                     while (_get_flags(pos).is_letter) {
@@ -421,9 +419,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
 
             // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
             auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
-            if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
+            if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) {
                 pos += (cpt == ' ');
-                while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
+                while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
                     flags2 = _get_flags(++pos);
                 }
                 uint32_t cpt2 = _get_cpt(pos);

examples/talk-llama/unicode.h

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ struct codepoint_flags {
 
 
 std::string unicode_cpt_to_utf8(uint32_t cp);
+uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
 
 std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
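
The newly exported unicode_cpt_from_utf8() (previously a static helper in unicode.cpp) decodes one codepoint and advances the offset. A hedged sketch of a manual decode loop, essentially what the existing unicode_cpts_from_utf8() already does internally:

    #include "unicode.h"

    std::vector<uint32_t> decode_codepoints(const std::string & utf8) {
        std::vector<uint32_t> cpts;
        size_t offset = 0;
        while (offset < utf8.size()) {
            cpts.push_back(unicode_cpt_from_utf8(utf8, offset)); // advances offset past the sequence
        }
        return cpts;
    }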

src/whisper.cpp

Lines changed: 1 addition & 1 deletion
@@ -2949,7 +2949,7 @@ struct whisper_global_cache {
 // Mel spectrogram
 
 void whisper_mel_init(whisper_mel & mel, ggml_backend_t backend, int n_len, int n_len_org, int n_mel) {
-    WHISPER_LOG_INFO("%s: n_len = %d, n_len_org = %d, n_mel = %d\n", __func__, n_len, n_len_org, n_mel);
+    //WHISPER_LOG_INFO("%s: n_len = %d, n_len_org = %d, n_mel = %d\n", __func__, n_len, n_len_org, n_mel);
     mel.n_len_org = n_len_org;
     assert(!mel.ctx);
     mel.ctx = ggml_init({ggml_tensor_overhead(), nullptr, true});
