Skip to content

Commit 1494a18

Browse files
committed
llama : throw on unknown tokenizer types
ggml-ci
1 parent 21ccd64 commit 1494a18

File tree

1 file changed

+8
-12
lines changed

1 file changed

+8
-12
lines changed

llama.cpp

Lines changed: 8 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -4595,20 +4595,14 @@ static void llm_load_vocab(
4595 4595
vocab.special_cls_id = 101;
4596 4596
vocab.special_mask_id = 103;
4597 4597
vocab.add_space_prefix = false;
4598-
} else {
4599-
if (tokenizer_model == "gpt2") {
4600-
vocab.type = LLAMA_VOCAB_TYPE_BPE;
4598+
} else if (tokenizer_model == "gpt2") {
4599+
vocab.type = LLAMA_VOCAB_TYPE_BPE;
4601 4600

4602-
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
4603-
if (add_space_prefix_keyidx != -1) {
4604-
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4605-
}
4606-
} else {
4607-
LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
4608-
LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
4609-
vocab.type = LLAMA_VOCAB_TYPE_SPM;
4610-
return;
4601+
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
4602+
if (add_space_prefix_keyidx != -1) {
4603+
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4611 4604
}
4605+
4612 4606
// read bpe merges and populate bpe ranks
4613 4607
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
4614 4608
if (merges_keyidx == -1) {
@@ -4642,6 +4636,8 @@ static void llm_load_vocab(
4642 4636
vocab.special_pad_id = -1;
4643 4637
vocab.special_cls_id = -1;
4644 4638
vocab.special_mask_id = -1;
4639+
} else {
4640+
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
4645 4641
}
4646 4642

4647 4643
// for now, only BPE models have pre-tokenizers

0 commit comments

Comments
 (0)