@@ -4595,20 +4595,14 @@ static void llm_load_vocab(
            vocab.special_cls_id  = 101;
            vocab.special_mask_id = 103;
            vocab.add_space_prefix = false;
-        } else {
-            if (tokenizer_model == "gpt2") {
-                vocab.type = LLAMA_VOCAB_TYPE_BPE;
+        } else if (tokenizer_model == "gpt2") {
+            vocab.type = LLAMA_VOCAB_TYPE_BPE;

-                const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-                if (add_space_prefix_keyidx != -1) {
-                    vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-                }
-            } else {
-                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
-                LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
-                vocab.type = LLAMA_VOCAB_TYPE_SPM;
-                return;
+            const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+            if (add_space_prefix_keyidx != -1) {
+                vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
            }
+

            // read bpe merges and populate bpe ranks
            const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
            if (merges_keyidx == -1) {
@@ -4642,6 +4636,8 @@ static void llm_load_vocab(
            vocab.special_pad_id  = -1;
            vocab.special_cls_id  = -1;
            vocab.special_mask_id = -1;
+        } else {
+            throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
        }

        // for now, only BPE models have pre-tokenizers