
Commit 474435f

Kawrakow and Iwan Kawrakow authored

LlaMA-4 support (text only) (ikawrakow#321)

* llama4: WIP
* llama4: this seems to be working

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>

1 parent 5f44f4b · commit 474435f

File tree

3 files changed: +295 -21 lines

include/llama.h

Lines changed: 6 additions & 1 deletion

```diff
@@ -100,7 +100,12 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
         LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
         LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
-        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 23, //llama.cpp lists this as 28
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28, //llama.cpp lists this as 28
+        LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
+        LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
+        LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
+        LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
+        LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
     };

     // note: these values should be synchronized with ggml_rope
```
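These enum values are normally selected at model-load time from the GGUF `tokenizer.ggml.pre` metadata string; the remaining changed file in this commit (not shown in this view) presumably carries that mapping. Below is a minimal sketch of the usual llama.cpp-style lookup, assuming the pre-tokenizer names follow upstream conventions ("llama4", "gpt-4o", "superbpe", "trillion", "bailingmoe") — the function name and the exact strings are assumptions, not part of this commit.

```cpp
// Hypothetical sketch, not the actual patch: map the GGUF
// "tokenizer.ggml.pre" string to one of the new enum values.
// The string names are assumed from upstream llama.cpp conventions.
#include <string>
#include "llama.h"

static llama_vocab_pre_type pre_type_from_name(const std::string & name) {
    if (name == "llama4")     return LLAMA_VOCAB_PRE_TYPE_LLAMA4;
    if (name == "gpt-4o")     return LLAMA_VOCAB_PRE_TYPE_GPT4O;
    if (name == "superbpe")   return LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
    if (name == "trillion")   return LLAMA_VOCAB_PRE_TYPE_TRILLION;
    if (name == "bailingmoe") return LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
    return LLAMA_VOCAB_PRE_TYPE_DEFAULT;   // fall back to default BPE pre-processing
}
```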

src/llama-vocab.cpp

Lines changed: 21 additions & 0 deletions

```diff
@@ -439,6 +439,27 @@ struct llm_tokenizer_bpe {
                     "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_GPT4O:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
+                regex_exprs = {
+                    "\\p{N}+",
+                    "(?=(\\d{3})+(?!\\d))",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
+                    // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
+                    "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
```
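The SUPERBPE case pairs "\\p{N}+" (isolate runs of digits) with the zero-width lookahead "(?=(\\d{3})+(?!\\d))", which cuts each digit run into three-digit groups counted from the right. The following plain C++ stand-in shows that grouping behaviour under that reading of the regex; it is only an illustration and does not go through the project's unicode regex machinery.

```cpp
// Illustration only: what the SUPERBPE digit-split rule achieves on a run
// of digits, e.g. "1234567" -> "1", "234", "567" (groups of three from the right).
#include <iostream>
#include <string>
#include <vector>

static std::vector<std::string> split_digits_right_to_left(const std::string & digits) {
    std::vector<std::string> out;
    const size_t first = digits.size() % 3;          // length of the leading (possibly short) group
    if (first > 0) out.push_back(digits.substr(0, first));
    for (size_t i = first; i < digits.size(); i += 3) {
        out.push_back(digits.substr(i, 3));          // remaining groups are exactly three digits
    }
    return out;
}

int main() {
    for (const auto & piece : split_digits_right_to_left("1234567")) {
        std::cout << piece << "\n";                  // prints: 1  234  567
    }
}
```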
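The BailingMoE case keeps the original tokenizer.json pattern as a comment and applies a rewrite in which the possessive quantifiers ?+ and ++ are relaxed to plain greedy ones, per the FIXME about errors and imatrix hangs. Below is a rough ASCII-only std::regex sketch of the rewritten pattern's shape (std::regex likewise has no possessive quantifiers); \p{L} and \p{N} are narrowed to [A-Za-z] and [0-9], so it only illustrates how the rule splits text, not the Unicode-aware behaviour.

```cpp
// Rough ASCII-only illustration of the rewritten (greedy) BailingMoE split rule.
// Not the project's implementation: character classes are simplified to ASCII.
#include <iostream>
#include <regex>
#include <string>

int main() {
    const std::regex re(
        "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])"   // contractions: 's 't 'll 've 're 'm 'd
        "|[^\\r\\nA-Za-z0-9]?[A-Za-z]+"                // optional leading symbol + a run of letters
        "|[0-9]"                                       // single digits
        "| ?[^ \\tA-Za-z0-9\\r\\n]+[\\r\\n]*"          // punctuation/symbol runs
        "|\\s*[\\r\\n]"                                // newlines with leading whitespace
        "|\\s+(?!\\S)"                                 // trailing whitespace
        "|\\s+");                                      // remaining whitespace

    const std::string text = "We've 2 cats!";
    for (auto it = std::sregex_iterator(text.begin(), text.end(), re);
         it != std::sregex_iterator(); ++it) {
        std::cout << "[" << it->str() << "]";          // [We]['ve][ ][2][ cats][!]
    }
    std::cout << "\n";
}
```

In this particular pattern the possessive forms only suppress backtracking and do not change which strings are matched, which is presumably why the greedy rewrite is a safe substitute here.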
