From 6303ea26401d7be51f5bf60113530dc683b450b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Wed, 21 May 2025 20:17:35 +0200 Subject: [PATCH 01/13] initial jina-embeddings-v3 support --- gguf-py/gguf/constants.py | 14 ++++++++++++++ gguf-py/gguf/tensor_mapping.py | 2 ++ 2 files changed, 16 insertions(+) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 7aed8b83ec378..618f871804b21 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -272,6 +272,7 @@ class MODEL_ARCH(IntEnum): NOMIC_BERT = auto() NOMIC_BERT_MOE = auto() JINA_BERT_V2 = auto() + JINA_BERT_V3 = auto() BLOOM = auto() STABLELM = auto() QWEN = auto() @@ -534,6 +535,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.NOMIC_BERT: "nomic-bert", MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe", MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", + MODEL_ARCH.JINA_BERT_V3: "jina-bert-v3", MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.QWEN: "qwen", @@ -1020,6 +1022,18 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.LAYER_OUT_NORM, MODEL_TENSOR.CLS, ], + MODEL_ARCH.JINA_BERT_V3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.TOKEN_TYPES, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_OUT_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.LAYER_OUT_NORM, + ], MODEL_ARCH.MPT: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 0298f8b460a3f..b6eb770d8aa0f 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -157,6 +157,7 @@ class TensorNameMap: "h.{bid}.attn.c_attn", # gpt2 "transformer.h.{bid}.mixer.Wqkv", # phi2 "encoder.layers.{bid}.attn.Wqkv", # nomic-bert + "encoder.layers.{bid}.mixer.Wqkv", # jina-bert-v3 "model.layers.{bid}.self_attn.qkv_proj", # phi3 "encoder.layers.{bid}.self_attention.query_key_value", # chatglm "transformer.layers.{bid}.attn.qkv_proj", # openelm @@ -224,6 +225,7 @@ class TensorNameMap: "model.layers.layers.{bid}.self_attn.o_proj", # plamo "model.layers.{bid}.attention.wo", # internlm2 "encoder.layers.{bid}.attn.out_proj", # nomic-bert + "encoder.layers.{bid}.mixer.out_proj", # jina-bert-v3 "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx "encoder.layers.{bid}.self_attention.dense", # chatglm From ba51f891bce132c2fdfdac175f8c78ec863627f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Wed, 21 May 2025 20:18:10 +0200 Subject: [PATCH 02/13] initial jina-embeddings-v3 support --- convert_hf_to_gguf.py | 44 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index e88076ccccd2d..12fd6a3643cf0 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -802,6 +802,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec": # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base res = "seed-coder" + if chkhsh == "a81863d07e75497e2194eb1a1574d5e5cd4d5f85a87a0728b922bf2bed6fb327": + # ref: https://huggingface.co/jinaai/jina-embeddings-v3 + res = "jina-v3" if res is None: logger.warning("\n") @@ -3829,12 +3832,24 @@ def _is_tokenizer_xlmroberta(self) -> bool: class XLMRobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT - def 
__init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._xlmroberta_tokenizer_init() + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any): + hparams = kwargs.pop("hparams", None) + if hparams is None: + hparams = ModelBase.load_hparams(dir_model) + + if hparams.get("lora_adaptations"): + self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3 + + super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) + + self._tokenizer_is_xlmroberta = False if self.model_arch == gguf.MODEL_ARCH.JINA_BERT_V3 else True + if self._tokenizer_is_xlmroberta: + self._xlmroberta_tokenizer_init() def set_vocab(self): - self._xlmroberta_set_vocab() + if self._tokenizer_is_xlmroberta: + return self._xlmroberta_set_vocab() + return super().set_vocab() def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # if name starts with "roberta.", remove the prefix @@ -3842,13 +3857,34 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.startswith("roberta."): name = name[8:] + # jina-embeddings-v3 + if ".parametrizations." in name: + name = name.replace(".parametrizations.", ".") + if name.endswith(".original"): + name = name[:-9] + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor if name == "embeddings.position_embeddings.weight": if self._position_offset is not None: data_torch = data_torch[self._position_offset:,:] + if name.endswith(".lora_A"): + # TODO: convert loras + return [] + + if name.endswith(".lora_B"): + # TODO: convert loras + return [] + return super().modify_tensors(data_torch, name, bid) + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # jina-embeddings-v3 + if rotary_emb_base := self.hparams.get("rotary_emb_base"): + self.gguf_writer.add_rope_freq_base(rotary_emb_base) + @ModelBase.register("GemmaForCausalLM") class GemmaModel(TextModel): From 4ac1380531dc0b74d1a1e7d14e273216086aa5e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Wed, 21 May 2025 20:20:02 +0200 Subject: [PATCH 03/13] initial jina-embeddings-v3 support --- src/llama-arch.cpp | 15 ++++++++++++++ src/llama-arch.h | 1 + src/llama-model.cpp | 50 +++++++++++++++++++++++++++++++++++++++++++-- src/llama-model.h | 1 + 4 files changed, 65 insertions(+), 2 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index abf436adac416..19f58ba82db3d 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -21,6 +21,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" }, { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, + { LLM_ARCH_JINA_BERT_V3, "jina-bert-v3" }, { LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_STABLELM, "stablelm" }, { LLM_ARCH_QWEN, "qwen" }, @@ -513,6 +514,20 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_CLS, "cls" }, }, }, + { + LLM_ARCH_JINA_BERT_V3, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_TOKEN_TYPES, "token_types" }, + { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + }, + }, { LLM_ARCH_BLOOM, { diff --git a/src/llama-arch.h b/src/llama-arch.h index 41a023da3da6e..c696b4113a4c7 100644 
--- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -25,6 +25,7 @@ enum llm_arch { LLM_ARCH_NOMIC_BERT, LLM_ARCH_NOMIC_BERT_MOE, LLM_ARCH_JINA_BERT_V2, + LLM_ARCH_JINA_BERT_V3, LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, LLM_ARCH_QWEN, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 82557ea054bb2..b0512bf44a34d 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -41,6 +41,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_410M: return "410M"; case LLM_TYPE_450M: return "450M"; case LLM_TYPE_475M: return "475M"; + case LLM_TYPE_558M: return "558M"; case LLM_TYPE_770M: return "770M"; case LLM_TYPE_780M: return "780M"; case LLM_TYPE_0_5B: return "0.5B"; @@ -710,6 +711,18 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_JINA_BERT_V3: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); + + switch (hparams.n_layer) { + case 24: + type = LLM_TYPE_558M; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT_MOE: { @@ -2215,6 +2228,36 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); + layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0); + layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0); + } + } break; + case LLM_ARCH_JINA_BERT_V3: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); + + tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); + tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0); + layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0); + + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); + + layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); + layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0); + + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0); + + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); + layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0); layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0); } @@ -5931,7 +5974,7 @@ struct llm_build_bert : public llm_graph_context { cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); - if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) { + if (model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); 
cb(cur, "bqkv", il); } @@ -6003,7 +6046,7 @@ struct llm_build_bert : public llm_graph_context { 0.0f, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(cur, "ffn_moe_out", il); - } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) { + } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) { cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, @@ -13187,6 +13230,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, switch (arch) { case LLM_ARCH_BERT: case LLM_ARCH_JINA_BERT_V2: + case LLM_ARCH_JINA_BERT_V3: case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT_MOE: { @@ -13292,6 +13336,7 @@ llm_graph_result_ptr llama_model::build_graph( } break; case LLM_ARCH_BERT: case LLM_ARCH_JINA_BERT_V2: + case LLM_ARCH_JINA_BERT_V3: case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT_MOE: { @@ -13658,6 +13703,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_GROK: case LLM_ARCH_DBRX: case LLM_ARCH_BERT: + case LLM_ARCH_JINA_BERT_V3: case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT_MOE: case LLM_ARCH_STABLELM: diff --git a/src/llama-model.h b/src/llama-model.h index cbea2cb331b62..77e36f12f252b 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -37,6 +37,7 @@ enum llm_type { LLM_TYPE_410M, LLM_TYPE_450M, LLM_TYPE_475M, + LLM_TYPE_558M, LLM_TYPE_770M, LLM_TYPE_780M, LLM_TYPE_0_5B, From 1274c8c357102f123d4262677120707cd3b0624a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 22 May 2025 11:55:03 +0200 Subject: [PATCH 04/13] fix vocab parsing with only tokenizer.json --- convert_hf_to_gguf.py | 132 ++++++++++++++++++++++++++++-------------- 1 file changed, 87 insertions(+), 45 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 12fd6a3643cf0..bf7915a932ed9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -802,9 +802,6 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec": # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base res = "seed-coder" - if chkhsh == "a81863d07e75497e2194eb1a1574d5e5cd4d5f85a87a0728b922bf2bed6fb327": - # ref: https://huggingface.co/jinaai/jina-embeddings-v3 - res = "jina-v3" if res is None: logger.warning("\n") @@ -3626,44 +3623,93 @@ def _xlmroberta_set_vocab(self) -> None: from sentencepiece import sentencepiece_model_pb2 as model tokenizer_path = self.dir_model / 'sentencepiece.bpe.model' + + tokenizer_json = {} + tokenizer_config_json = {} if not tokenizer_path.is_file(): - raise FileNotFoundError(f"File not found: {tokenizer_path}") + tokenizer_path = self.dir_model / 'tokenizer.json' + tokenizer_config_path = self.dir_model / 'tokenizer_config.json' - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] - sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") - add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces - precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap + from base64 import b64decode + from transformers import AutoTokenizer + tokenizer = 
AutoTokenizer.from_pretrained(self.dir_model) - tokenizer = SentencePieceProcessor() - tokenizer.LoadFromFile(str(tokenizer_path)) + with open(tokenizer_path, "r", encoding="utf-8") as fp: + tokenizer_json = json.load(fp) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + if tokenizer_config_path.is_file(): + with open(tokenizer_config_path, "r", encoding="utf-8") as fp: + tokenizer_config_json = json.load(fp) + + add_prefix = tokenizer.add_prefix_space + remove_whitespaces = tokenizer.clean_up_tokenization_spaces + precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"]) + + vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size) + else: + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces + precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - for token_id in range(tokenizer.vocab_size()): - piece = tokenizer.IdToPiece(token_id) - text = piece.encode("utf-8") - score = tokenizer.GetScore(token_id) + if isinstance(tokenizer, SentencePieceProcessor): + for token_id in range(vocab_size): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + else: + added_vocab = tokenizer.get_added_vocab() + unk_token = tokenizer_config_json.get("unk_token") + unk_token_id = added_vocab.get(unk_token, 3) + + for token_id in range(vocab_size): + piece = tokenizer._convert_id_to_token(token_id) + text = piece.encode("utf-8") + score = tokenizer_json["model"]["vocab"][token_id][1] + + toktype = SentencePieceTokenTypes.NORMAL + if token_id == unk_token_id: + toktype = SentencePieceTokenTypes.UNKNOWN + elif token_id in tokenizer.all_special_ids: + toktype = SentencePieceTokenTypes.CONTROL + elif token_id in added_vocab.values(): + toktype = SentencePieceTokenTypes.USER_DEFINED + # No reliable way to detect this, but jina-embeddings-v3 doesn't have any + # elif tokenizer.IsByte(token_id): + # toktype = 
SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype if vocab_size > len(tokens): pad_count = vocab_size - len(tokens) @@ -3673,15 +3719,16 @@ def _xlmroberta_set_vocab(self) -> None: scores.append(-1000.0) toktypes.append(SentencePieceTokenTypes.UNUSED) - # realign tokens (see HF tokenizer code) - tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1] - scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1] - toktypes = [ - SentencePieceTokenTypes.CONTROL, - SentencePieceTokenTypes.CONTROL, - SentencePieceTokenTypes.CONTROL, - SentencePieceTokenTypes.UNKNOWN, - ] + toktypes[3:-1] + if isinstance(tokenizer, SentencePieceProcessor): + # realign tokens (see HF tokenizer code) + tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1] + scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1] + toktypes = [ + SentencePieceTokenTypes.CONTROL, + SentencePieceTokenTypes.CONTROL, + SentencePieceTokenTypes.CONTROL, + SentencePieceTokenTypes.UNKNOWN, + ] + toktypes[3:-1] self.gguf_writer.add_tokenizer_model("t5") self.gguf_writer.add_tokenizer_pre("default") @@ -3841,15 +3888,10 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3 super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) - - self._tokenizer_is_xlmroberta = False if self.model_arch == gguf.MODEL_ARCH.JINA_BERT_V3 else True - if self._tokenizer_is_xlmroberta: - self._xlmroberta_tokenizer_init() + self._xlmroberta_tokenizer_init() def set_vocab(self): - if self._tokenizer_is_xlmroberta: - return self._xlmroberta_set_vocab() - return super().set_vocab() + self._xlmroberta_set_vocab() def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # if name starts with "roberta.", remove the prefix From 65a37fa8e5f83afd36dd84ea9dbafb7c5e56388a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 22 May 2025 15:19:04 +0200 Subject: [PATCH 05/13] set mask token lstrip attribute --- src/llama-vocab.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 9389ca805a584..6d6a1c6e18782 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2080,9 +2080,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { std::string model_name; std::string tokenizer_pre; + std::string general_arch; ml.get_key(LLM_KV_GENERAL_NAME, model_name, false); ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false); + ml.get_key(LLM_KV_GENERAL_ARCHITECTURE, general_arch, false); // model name to lowercase std::transform(model_name.begin(), model_name.end(), model_name.begin(), @@ -2091,8 +2093,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } ); - // set attributes by model/tokenizer name - if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) { + // set attributes by model/tokenizer/architecture name + if (false + || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"}) + || _contains_any(general_arch, {"jina-bert-v3"}) + ) { _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true); } else if (_contains_any(model_name, {"phi-3", "phi3"})) { for (auto id : cache_special_tokens) { From f2d876ad5ab5013d4df5848038e18e2f8b51760a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Fri, 23 May 2025 09:22:45 +0200 Subject: [PATCH 06/13] additional unk_token_id fallback just in case [no ci] ---
convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index bf7915a932ed9..0f2c41ecc8b3d 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3689,7 +3689,7 @@ def _xlmroberta_set_vocab(self) -> None: else: added_vocab = tokenizer.get_added_vocab() unk_token = tokenizer_config_json.get("unk_token") - unk_token_id = added_vocab.get(unk_token, 3) + unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3)) for token_id in range(vocab_size): piece = tokenizer._convert_id_to_token(token_id) From b17e9811f47adc32682042a238219ea3b37d7e32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Mon, 26 May 2025 08:40:46 +0200 Subject: [PATCH 07/13] revert vocab_size() change [no ci] --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 0f2c41ecc8b3d..753c88e7c5f17 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3668,7 +3668,7 @@ def _xlmroberta_set_vocab(self) -> None: toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size if isinstance(tokenizer, SentencePieceProcessor): - for token_id in range(vocab_size): + for token_id in range(tokenizer.vocab_size()): piece = tokenizer.IdToPiece(token_id) text = piece.encode("utf-8") score = tokenizer.GetScore(token_id) From f5d0305d51de45f7f4642e16349f27b37862502a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 1 Jun 2025 20:55:03 +0200 Subject: [PATCH 08/13] merge tensor loading into general bert --- src/llama-model.cpp | 47 ++++++++------------------------------------- 1 file changed, 8 insertions(+), 39 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e46403f3c2011..31b27106315c1 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2128,6 +2128,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT_MOE: + case LLM_ARCH_JINA_BERT_V3: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED); @@ -2163,24 +2164,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0); if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) { - layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0); layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0); layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); } else { - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); - - if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) { - layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 
0); - layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0); - layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); - } else { + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); + + if (arch == LLM_ARCH_NOMIC_BERT) { layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); } } @@ -2232,36 +2231,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); - layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0); - layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0); - } - } break; - case LLM_ARCH_JINA_BERT_V3: - { - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); - type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); - - tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); - tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); - - for (int i = 0; i < n_layer; ++i) { - auto & layer = layers[i]; - - layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0); - layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0); - - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); - layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); - - layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); - layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0); - - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0); - - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); - layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); - layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0); layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0); } From 3862d954bbfa85cc4301cc7cd61548956c09db01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 1 Jun 2025 21:46:15 +0200 Subject: [PATCH 09/13] rope --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 31b27106315c1..9ac88c27c5f79 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -5961,7 +5961,7 @@ struct llm_build_bert : public llm_graph_context { Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // RoPE - if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) { + if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) { Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, 
freq_base, freq_scale, From 9a39ccb7d96d3de1b9da84929392e704d25f7a92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 6 Jul 2025 16:48:37 +0200 Subject: [PATCH 10/13] add lora embedding and loading (non-functional) --- convert_hf_to_gguf.py | 30 +++++++++++++++++------- gguf-py/gguf/constants.py | 6 +++-- src/llama-adapter.h | 1 + src/llama-arch.cpp | 6 +++-- src/llama-arch.h | 2 ++ src/llama-model.cpp | 48 +++++++++++++++++++++++++++++++++++++++ src/llama-model.h | 4 ++++ 7 files changed, 85 insertions(+), 12 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 998d8723da282..c8b54894031f6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4185,15 +4185,22 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if self._position_offset is not None: data_torch = data_torch[self._position_offset:,:] - if name.endswith(".lora_A"): - # TODO: convert loras - return [] - - if name.endswith(".lora_B"): - # TODO: convert loras - return [] + if name.endswith(".weight.0.lora_A") or name.endswith(".weight.0.lora_B"): + if name.startswith("pooler.dense"): + return + + lora_name = self.hparams["lora_adaptations"] + num_loras = data_torch.size(0) + assert num_loras == len(lora_name) + + # Split out each LoRA in their own named tensors + # Remove "weight" from the name to not confuse quantize + for i in range(num_loras): + data_lora = data_torch[i, :, :] + yield (self.map_tensor_name(name[:-16]) + name[-16:].lower().replace("weight.0.", f"<{lora_name[i]}>"), data_lora) + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) def set_gguf_parameters(self): super().set_gguf_parameters() @@ -4201,6 +4208,13 @@ def set_gguf_parameters(self): # jina-embeddings-v3 if rotary_emb_base := self.hparams.get("rotary_emb_base"): self.gguf_writer.add_rope_freq_base(rotary_emb_base) + if lora_alpha := self.hparams.get("lora_alpha"): + self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, lora_alpha) + if lora_names := self.hparams.get("lora_adaptations"): + self.gguf_writer.add_array(gguf.Keys.Adapter.LORA_NAMES, lora_names) + if lora_prompt_prefixes := self.hparams.get("task_instructions"): + assert lora_names and all(lora_name in lora_prompt_prefixes for lora_name in lora_names) + self.gguf_writer.add_array(gguf.Keys.Adapter.LORA_PROMPT_PREFIXES, [lora_prompt_prefixes[lora_name] for lora_name in lora_names]) @ModelBase.register("GemmaForCausalLM") diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 6d3d596d68b8f..5b62dc3caef9f 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -227,8 +227,10 @@ class Tokenizer: MIDDLE_ID = "tokenizer.ggml.middle_token_id" class Adapter: - TYPE = "adapter.type" - LORA_ALPHA = "adapter.lora.alpha" + TYPE = "adapter.type" + LORA_ALPHA = "adapter.lora.alpha" + LORA_NAMES = "adapter.lora.names" + LORA_PROMPT_PREFIXES = "adapter.lora.prompt_prefixes" class Clip: PROJECTOR_TYPE = "clip.projector_type" diff --git a/src/llama-adapter.h b/src/llama-adapter.h index 65824e972765b..1439ebdeb6c05 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -66,6 +66,7 @@ struct llama_adapter_lora { std::vector bufs; float alpha; + std::string prompt_prefix; llama_adapter_lora() = default; ~llama_adapter_lora() = default; diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index cbec319541aca..223aea9612760 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -217,8 +217,10 
@@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" }, { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" }, - { LLM_KV_ADAPTER_TYPE, "adapter.type" }, - { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, + { LLM_KV_ADAPTER_TYPE, "adapter.type" }, + { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, + { LLM_KV_ADAPTER_LORA_NAMES, "adapter.lora.names" }, + { LLM_KV_ADAPTER_LORA_PROMPT_PREFIXES, "adapter.lora.prompt_prefixes" }, // deprecated { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 89608ca6306a8..657a3365a9d7b 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -215,6 +215,8 @@ enum llm_kv { LLM_KV_ADAPTER_TYPE, LLM_KV_ADAPTER_LORA_ALPHA, + LLM_KV_ADAPTER_LORA_NAMES, + LLM_KV_ADAPTER_LORA_PROMPT_PREFIXES, LLM_KV_POSNET_EMBEDDING_LENGTH, LLM_KV_POSNET_BLOCK_COUNT, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e15a6cb836c75..6949f697afe10 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1720,6 +1720,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) { ggml_backend_buffer_type_t first_moved_from_buft = nullptr; ggml_backend_buffer_type_t first_moved_to_buft = nullptr; + auto add_lora_tensors = [&](const std::string & lora_name, const std::string & tensor_name) -> void { + std::string base_name = tensor_name.substr(0, tensor_name.size() - 6); + + ggml_tensor * lora_a = ml.get_tensor_meta((base_name + "<" + lora_name + ">lora_a").c_str()); + ggml_tensor * lora_b = ml.get_tensor_meta((base_name + "<" + lora_name + ">lora_b").c_str()); + loras[lora_name]->ab_map[tensor_name] = llama_adapter_lora_weight(lora_a, lora_b); + + ml.n_created += 2; + }; + auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list & ne, int flags) -> ggml_tensor * { ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str()); @@ -2246,6 +2256,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case LLM_ARCH_NOMIC_BERT_MOE: case LLM_ARCH_JINA_BERT_V3: { + std::vector lora_names; + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED); @@ -2262,6 +2274,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) { tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); + if (arch == LLM_ARCH_JINA_BERT_V3) { + float lora_alpha = 1.0f; + std::vector lora_prompt_prefixes; + + ml.get_key(LLM_KV_ADAPTER_LORA_ALPHA, lora_alpha, false); + ml.get_arr(LLM_KV_ADAPTER_LORA_NAMES, lora_names, false); + ml.get_arr(LLM_KV_ADAPTER_LORA_PROMPT_PREFIXES, lora_prompt_prefixes, false); + GGML_ASSERT(lora_names.size() == lora_prompt_prefixes.size()); + + for (size_t i = 0; i < lora_names.size(); ++i) { + llama_adapter_lora * adapter = new llama_adapter_lora(); + std::string lora_name = lora_names[i]; + + adapter->alpha = lora_alpha; + adapter->prompt_prefix = lora_prompt_prefixes[i]; + loras[lora_name] = adapter; + + add_lora_tensors(lora_name, tok_embd->name); + + if (type_embd) { + add_lora_tensors(lora_name, type_embd->name); + } + } + } + for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; @@ -2300,6 +2337,17 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } + if (arch == LLM_ARCH_JINA_BERT_V3) { + GGML_ASSERT(layer.wqkv != nullptr); + + for (const 
auto & lora_name : lora_names) { + add_lora_tensors(lora_name, layer.wqkv->name); + add_lora_tensors(lora_name, layer.wo->name); + add_lora_tensors(lora_name, layer.ffn_up->name); + add_lora_tensors(lora_name, layer.ffn_down->name); + } + } + layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0); layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0); } diff --git a/src/llama-model.h b/src/llama-model.h index 9a7d8727b662c..21f217aaf4410 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -7,6 +7,7 @@ #include "llama-memory.h" #include "llama-vocab.h" +#include #include #include #include @@ -383,6 +384,9 @@ struct llama_model { llama_model_params params; + // built-in LoRAs + std::map loras; + // gguf metadata std::unordered_map gguf_kv; From 966d0e0e0bdc90a6dc24f5a3a3d3558a77b79fef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 6 Jul 2025 22:24:46 +0200 Subject: [PATCH 11/13] export separate lora ggufs instead --- common/arg.cpp | 4 +-- common/common.cpp | 2 ++ common/common.h | 3 ++ convert_hf_to_gguf.py | 67 ++++++++++++++++++++++++++++----------- gguf-py/gguf/constants.py | 8 ++--- include/llama.h | 6 ++++ src/llama-adapter.cpp | 10 ++++++ src/llama-adapter.h | 1 + src/llama-arch.cpp | 8 ++--- src/llama-arch.h | 4 +-- src/llama-model.cpp | 48 ---------------------------- src/llama-model.h | 4 --- tools/server/server.cpp | 2 ++ 13 files changed, 85 insertions(+), 82 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 40af7e574830f..b9502b819ceda 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2460,7 +2460,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--lora"}, "FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)", [](common_params & params, const std::string & value) { - params.lora_adapters.push_back({ std::string(value), 1.0, nullptr }); + params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); @@ -2468,7 +2468,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--lora-scaled"}, "FNAME", "SCALE", "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", [](common_params & params, const std::string & fname, const std::string & scale) { - params.lora_adapters.push_back({ fname, std::stof(scale), nullptr }); + params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); diff --git a/common/common.cpp b/common/common.cpp index e4e71ad13fb59..5a97fe868457d 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -993,6 +993,8 @@ struct common_init_result common_init_from_params(common_params & params) { } la.ptr = lora.get(); + la.task_name = llama_adapter_lora_task_name(la.ptr); + la.prompt_prefix = llama_adapter_lora_prompt_prefix(la.ptr); iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters } diff --git a/common/common.h b/common/common.h index 8922090e7b10d..705295a80c1f2 100644 --- a/common/common.h +++ b/common/common.h @@ 
-31,6 +31,9 @@ struct common_adapter_lora_info { std::string path; float scale; + std::string task_name; + std::string prompt_prefix; + struct llama_adapter_lora * ptr; }; diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c8b54894031f6..7be3035feac65 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -64,6 +64,7 @@ class ModelBase: endianess: gguf.GGUFEndian use_temp_file: bool lazy: bool + dry_run: bool part_names: list[str] is_safetensors: bool hparams: dict[str, Any] @@ -98,6 +99,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE self.use_temp_file = use_temp_file self.lazy = not eager or (remote_hf_model_id is not None) + self.dry_run = dry_run self.remote_hf_model_id = remote_hf_model_id if remote_hf_model_id is not None: self.is_safetensors = True @@ -4153,18 +4155,31 @@ def modify_tensors(self, data_torch, name, bid): @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") class XLMRobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT + _lora_files = {} def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any): hparams = kwargs.pop("hparams", None) if hparams is None: hparams = ModelBase.load_hparams(dir_model) - if hparams.get("lora_adaptations"): + if lora_names := hparams.get("lora_adaptations"): self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3 super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) + + if lora_names: + for name in lora_names: + fname = self.add_prefix_to_filename(self.fname_out, f"lora-{name}-") + self._lora_files[name] = gguf.GGUFWriter(fname, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, dry_run=self.dry_run) + self._xlmroberta_tokenizer_init() + def set_type(self): + for lora_writer in self._lora_files.values(): + lora_writer.add_type(gguf.GGUFType.ADAPTER) + lora_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") + super().set_type() + def set_vocab(self): self._xlmroberta_set_vocab() @@ -4185,22 +4200,29 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if self._position_offset is not None: data_torch = data_torch[self._position_offset:,:] - if name.endswith(".weight.0.lora_A") or name.endswith(".weight.0.lora_B"): + if name.endswith(".0.lora_A") or name.endswith(".0.lora_B"): if name.startswith("pooler.dense"): - return + return [] - lora_name = self.hparams["lora_adaptations"] num_loras = data_torch.size(0) - assert num_loras == len(lora_name) + assert num_loras == len(self._lora_files) + + # Split out each LoRA in their own GGUF + for i, lora_writer in enumerate(self._lora_files.values()): + new_name = self.map_tensor_name(name[:-9]) + name[-7:].lower() + data_qtype = gguf.GGMLQuantizationType.F32 + data = data_torch[i, :, :] + # Transpose/flip token_embd/types into correct shape + if new_name == "token_embd.weight.lora_b": + data = data.T + elif new_name.startswith("token_types.weight."): + new_name = new_name[:-1] + ("a" if new_name[-1:] == "b" else "b") + data = gguf.quants.quantize(data.numpy(), data_qtype) + lora_writer.add_tensor(new_name, data, raw_dtype=data_qtype) - # Split out each LoRA in their own named tensors - # Remove "weight" from the name to not confuse quantize - for i in range(num_loras): - data_lora = data_torch[i, :, :] - yield (self.map_tensor_name(name[:-16]) + name[-16:].lower().replace("weight.0.", 
f"<{lora_name[i]}>"), data_lora) - return + return [] - yield from super().modify_tensors(data_torch, name, bid) + return super().modify_tensors(data_torch, name, bid) def set_gguf_parameters(self): super().set_gguf_parameters() @@ -4208,13 +4230,22 @@ def set_gguf_parameters(self): # jina-embeddings-v3 if rotary_emb_base := self.hparams.get("rotary_emb_base"): self.gguf_writer.add_rope_freq_base(rotary_emb_base) - if lora_alpha := self.hparams.get("lora_alpha"): - self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, lora_alpha) - if lora_names := self.hparams.get("lora_adaptations"): - self.gguf_writer.add_array(gguf.Keys.Adapter.LORA_NAMES, lora_names) + lora_alpha = self.hparams.get("lora_alpha") if lora_prompt_prefixes := self.hparams.get("task_instructions"): - assert lora_names and all(lora_name in lora_prompt_prefixes for lora_name in lora_names) - self.gguf_writer.add_array(gguf.Keys.Adapter.LORA_PROMPT_PREFIXES, [lora_prompt_prefixes[lora_name] for lora_name in lora_names]) + assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys()) + for lora_name, lora_writer in self._lora_files.items(): + lora_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, lora_alpha if lora_alpha is not None else 1.0) + lora_writer.add_string(gguf.Keys.Adapter.LORA_TASK_NAME, lora_name) + if lora_prompt_prefixes: + lora_writer.add_string(gguf.Keys.Adapter.LORA_PROMPT_PREFIX, lora_prompt_prefixes[lora_name]) + + def write(self): + super().write() + for lora_writer in self._lora_files.values(): + lora_writer.write_header_to_file() + lora_writer.write_kv_data_to_file() + lora_writer.write_tensors_to_file(progress=True) + lora_writer.close() @ModelBase.register("GemmaForCausalLM") diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 5b62dc3caef9f..fbb2f58b8a69a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -227,10 +227,10 @@ class Tokenizer: MIDDLE_ID = "tokenizer.ggml.middle_token_id" class Adapter: - TYPE = "adapter.type" - LORA_ALPHA = "adapter.lora.alpha" - LORA_NAMES = "adapter.lora.names" - LORA_PROMPT_PREFIXES = "adapter.lora.prompt_prefixes" + TYPE = "adapter.type" + LORA_ALPHA = "adapter.lora.alpha" + LORA_TASK_NAME = "adapter.lora.task_name" + LORA_PROMPT_PREFIX = "adapter.lora.prompt_prefix" class Clip: PROJECTOR_TYPE = "clip.projector_type" diff --git a/include/llama.h b/include/llama.h index 3eda9bc68608c..7b5279179dae6 100644 --- a/include/llama.h +++ b/include/llama.h @@ -588,6 +588,12 @@ extern "C" { struct llama_model * model, const char * path_lora); + // Get the LoRA task name. Returns a blank string if not applicable + LLAMA_API const char * llama_adapter_lora_task_name(struct llama_adapter_lora * adapter); + + // Get the required LoRA prompt prefix. 
Returns a blank string if not applicable + LLAMA_API const char * llama_adapter_lora_prompt_prefix(struct llama_adapter_lora * adapter); + // Manually free a LoRA adapter // Note: loaded adapters will be free when the associated model is deleted LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter); diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index 8d94034aed95d..f79f60582d303 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -190,6 +190,8 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ } adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA)); + adapter.task_name = get_kv_str(llm_kv(LLM_KV_ADAPTER_LORA_TASK_NAME)); + adapter.prompt_prefix = get_kv_str(llm_kv(LLM_KV_ADAPTER_LORA_PROMPT_PREFIX)); } int n_tensors = gguf_get_n_tensors(ctx_gguf.get()); @@ -383,6 +385,14 @@ llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * p return nullptr; } +const char * llama_adapter_lora_task_name(llama_adapter_lora * adapter) { + return adapter->task_name.c_str(); +} + +const char * llama_adapter_lora_prompt_prefix(llama_adapter_lora * adapter) { + return adapter->prompt_prefix.c_str(); +} + void llama_adapter_lora_free(llama_adapter_lora * adapter) { delete adapter; } diff --git a/src/llama-adapter.h b/src/llama-adapter.h index 1439ebdeb6c05..c95618e1b21fe 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -66,6 +66,7 @@ struct llama_adapter_lora { std::vector bufs; float alpha; + std::string task_name; std::string prompt_prefix; llama_adapter_lora() = default; diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 223aea9612760..968f661cb4c77 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -217,10 +217,10 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" }, { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" }, - { LLM_KV_ADAPTER_TYPE, "adapter.type" }, - { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, - { LLM_KV_ADAPTER_LORA_NAMES, "adapter.lora.names" }, - { LLM_KV_ADAPTER_LORA_PROMPT_PREFIXES, "adapter.lora.prompt_prefixes" }, + { LLM_KV_ADAPTER_TYPE, "adapter.type" }, + { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, + { LLM_KV_ADAPTER_LORA_TASK_NAME, "adapter.lora.task_name" }, + { LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, "adapter.lora.prompt_prefix" }, // deprecated { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 657a3365a9d7b..e4f0c861f094d 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -215,8 +215,8 @@ enum llm_kv { LLM_KV_ADAPTER_TYPE, LLM_KV_ADAPTER_LORA_ALPHA, - LLM_KV_ADAPTER_LORA_NAMES, - LLM_KV_ADAPTER_LORA_PROMPT_PREFIXES, + LLM_KV_ADAPTER_LORA_TASK_NAME, + LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, LLM_KV_POSNET_EMBEDDING_LENGTH, LLM_KV_POSNET_BLOCK_COUNT, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6949f697afe10..e15a6cb836c75 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1720,16 +1720,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { ggml_backend_buffer_type_t first_moved_from_buft = nullptr; ggml_backend_buffer_type_t first_moved_to_buft = nullptr; - auto add_lora_tensors = [&](const std::string & lora_name, const std::string & tensor_name) -> void { - std::string base_name = tensor_name.substr(0, tensor_name.size() - 6); - - ggml_tensor * lora_a = ml.get_tensor_meta((base_name + "<" + lora_name + ">lora_a").c_str()); - ggml_tensor * lora_b = 
ml.get_tensor_meta((base_name + "<" + lora_name + ">lora_b").c_str()); - loras[lora_name]->ab_map[tensor_name] = llama_adapter_lora_weight(lora_a, lora_b); - - ml.n_created += 2; - }; - auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list & ne, int flags) -> ggml_tensor * { ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str()); @@ -2256,8 +2246,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case LLM_ARCH_NOMIC_BERT_MOE: case LLM_ARCH_JINA_BERT_V3: { - std::vector lora_names; - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED); @@ -2274,31 +2262,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); - if (arch == LLM_ARCH_JINA_BERT_V3) { - float lora_alpha = 1.0f; - std::vector lora_prompt_prefixes; - - ml.get_key(LLM_KV_ADAPTER_LORA_ALPHA, lora_alpha, false); - ml.get_arr(LLM_KV_ADAPTER_LORA_NAMES, lora_names, false); - ml.get_arr(LLM_KV_ADAPTER_LORA_PROMPT_PREFIXES, lora_prompt_prefixes, false); - GGML_ASSERT(lora_names.size() == lora_prompt_prefixes.size()); - - for (size_t i = 0; i < lora_names.size(); ++i) { - llama_adapter_lora * adapter = new llama_adapter_lora(); - std::string lora_name = lora_names[i]; - - adapter->alpha = lora_alpha; - adapter->prompt_prefix = lora_prompt_prefixes[i]; - loras[lora_name] = adapter; - - add_lora_tensors(lora_name, tok_embd->name); - - if (type_embd) { - add_lora_tensors(lora_name, type_embd->name); - } - } - } - for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; @@ -2337,17 +2300,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } - if (arch == LLM_ARCH_JINA_BERT_V3) { - GGML_ASSERT(layer.wqkv != nullptr); - - for (const auto & lora_name : lora_names) { - add_lora_tensors(lora_name, layer.wqkv->name); - add_lora_tensors(lora_name, layer.wo->name); - add_lora_tensors(lora_name, layer.ffn_up->name); - add_lora_tensors(lora_name, layer.ffn_down->name); - } - } - layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0); layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0); } diff --git a/src/llama-model.h b/src/llama-model.h index 21f217aaf4410..9a7d8727b662c 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -7,7 +7,6 @@ #include "llama-memory.h" #include "llama-vocab.h" -#include #include #include #include @@ -384,9 +383,6 @@ struct llama_model { llama_model_params params; - // built-in LoRAs - std::map loras; - // gguf metadata std::unordered_map gguf_kv; diff --git a/tools/server/server.cpp b/tools/server/server.cpp index d3f6271931f62..7142291071c83 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -4761,6 +4761,8 @@ int main(int argc, char ** argv) { {"id", i}, {"path", lora.path}, {"scale", lora.scale}, + {"task_name", lora.task_name}, + {"prompt_prefix", lora.prompt_prefix}, }); } res_ok(res, result); From bea6d0617ee48504f62f2205a3e941e8248cbdf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Mon, 7 Jul 2025 07:24:28 +0200 Subject: [PATCH 12/13] add adapter metadata api --- common/common.cpp | 4 +-- common/common.h | 4 +-- include/llama.h | 20 +++++++++--- src/llama-adapter.cpp | 74 +++++++++++++++++++++++++++++++++++++------ 
src/llama-adapter.h | 5 +-- 5 files changed, 87 insertions(+), 20 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 5a97fe868457d..9567f99b877e5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -993,8 +993,8 @@ struct common_init_result common_init_from_params(common_params & params) { } la.ptr = lora.get(); - la.task_name = llama_adapter_lora_task_name(la.ptr); - la.prompt_prefix = llama_adapter_lora_prompt_prefix(la.ptr); + llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", la.task_name, sizeof(la.task_name)); + llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", la.prompt_prefix, sizeof(la.prompt_prefix)); iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters } diff --git a/common/common.h b/common/common.h index 705295a80c1f2..42e1c29e3ba02 100644 --- a/common/common.h +++ b/common/common.h @@ -31,8 +31,8 @@ struct common_adapter_lora_info { std::string path; float scale; - std::string task_name; - std::string prompt_prefix; + char task_name[64]; + char prompt_prefix[256]; struct llama_adapter_lora * ptr; }; diff --git a/include/llama.h b/include/llama.h index 7b5279179dae6..51bc73c7718d8 100644 --- a/include/llama.h +++ b/include/llama.h @@ -588,11 +588,23 @@ extern "C" { struct llama_model * model, const char * path_lora); - // Get the LoRA task name. Returns a blank string if not applicable - LLAMA_API const char * llama_adapter_lora_task_name(struct llama_adapter_lora * adapter); + // Functions to access the adapter's GGUF metadata scalar values + // - The functions return the length of the string on success, or -1 on failure + // - The output string is always null-terminated and cleared on failure + // - When retrieving a string, an extra byte must be allocated to account for the null terminator + // - GGUF array values are not supported by these functions + + // Get metadata value as a string by key name + LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size); - // Get the required LoRA prompt prefix. Returns a blank string if not applicable - LLAMA_API const char * llama_adapter_lora_prompt_prefix(struct llama_adapter_lora * adapter); + // Get the number of metadata key/value pairs + LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter); + + // Get metadata key name by index + LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size); + + // Get metadata value as a string by index + LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size); // Manually free a LoRA adapter // Note: loaded adapters will be free when the associated model is deleted diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index f79f60582d303..772ce1b448088 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -163,13 +163,38 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ // check metadata { + const gguf_context * gguf_ctx = ctx_gguf.get(); + + LLAMA_LOG_INFO("%s: Dumping metadata keys/values.\n", __func__); + + // get metadata as string + for (int i = 0; i < gguf_get_n_kv(gguf_ctx); i++) { + gguf_type type = gguf_get_kv_type(gguf_ctx, i); + const std::string type_name = + type == GGUF_TYPE_ARRAY + ? 
format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(gguf_ctx, i)), gguf_get_arr_n(gguf_ctx, i)) + : gguf_type_name(type); + const char * name = gguf_get_key(gguf_ctx, i); + const std::string value = gguf_kv_to_str(gguf_ctx, i); + + if (type != GGUF_TYPE_ARRAY) { + adapter.gguf_kv.emplace(name, value); + } + + const size_t MAX_VALUE_LEN = 40; + std::string print_value = value.size() > MAX_VALUE_LEN ? format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()) : value; + replace_all(print_value, "\n", "\\n"); + + LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), print_value.c_str()); + } + auto get_kv_str = [&](const std::string & key) -> std::string { - int id = gguf_find_key(ctx_gguf.get(), key.c_str()); - return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id)); + int id = gguf_find_key(gguf_ctx, key.c_str()); + return id < 0 ? "" : std::string(gguf_get_val_str(gguf_ctx, id)); }; auto get_kv_f32 = [&](const std::string & key) -> float { - int id = gguf_find_key(ctx_gguf.get(), key.c_str()); - return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id); + int id = gguf_find_key(gguf_ctx, key.c_str()); + return id < 0 ? 0.0f : gguf_get_val_f32(gguf_ctx, id); }; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); @@ -190,8 +215,6 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ } adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA)); - adapter.task_name = get_kv_str(llm_kv(LLM_KV_ADAPTER_LORA_TASK_NAME)); - adapter.prompt_prefix = get_kv_str(llm_kv(LLM_KV_ADAPTER_LORA_PROMPT_PREFIX)); } int n_tensors = gguf_get_n_tensors(ctx_gguf.get()); @@ -385,12 +408,43 @@ llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * p return nullptr; } -const char * llama_adapter_lora_task_name(llama_adapter_lora * adapter) { - return adapter->task_name.c_str(); +int32_t llama_adapter_meta_val_str(const llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size) { + const auto & it = adapter->gguf_kv.find(key); + if (it == adapter->gguf_kv.end()) { + if (buf_size > 0) { + buf[0] = '\0'; + } + return -1; + } + return snprintf(buf, buf_size, "%s", it->second.c_str()); } -const char * llama_adapter_lora_prompt_prefix(llama_adapter_lora * adapter) { - return adapter->prompt_prefix.c_str(); +int32_t llama_adapter_meta_count(const llama_adapter_lora * adapter) { + return (int)adapter->gguf_kv.size(); +} + +int32_t llama_adapter_meta_key_by_index(const llama_adapter_lora * adapter, int i, char * buf, size_t buf_size) { + if (i < 0 || i >= (int)adapter->gguf_kv.size()) { + if (buf_size > 0) { + buf[0] = '\0'; + } + return -1; + } + auto it = adapter->gguf_kv.begin(); + std::advance(it, i); + return snprintf(buf, buf_size, "%s", it->first.c_str()); +} + +int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size) { + if (i < 0 || i >= (int)adapter->gguf_kv.size()) { + if (buf_size > 0) { + buf[0] = '\0'; + } + return -1; + } + auto it = adapter->gguf_kv.begin(); + std::advance(it, i); + return snprintf(buf, buf_size, "%s", it->second.c_str()); } void llama_adapter_lora_free(llama_adapter_lora * adapter) { diff --git a/src/llama-adapter.h b/src/llama-adapter.h index c95618e1b21fe..9084e7cab08fd 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -66,8 +66,9 @@ struct llama_adapter_lora { std::vector bufs; float alpha; - std::string task_name; - std::string prompt_prefix; + + // gguf metadata + 
std::unordered_map<std::string, std::string> gguf_kv; llama_adapter_lora() = default; ~llama_adapter_lora() = default; From 9b67ea265b21be5fade31ab64bdc7e1a983fa682 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 8 Jul 2025 18:11:19 +0200 Subject: [PATCH 13/13] use std::string --- common/common.cpp | 8 ++++++-- common/common.h | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 9567f99b877e5..420ee6cd1c8a0 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -981,6 +981,8 @@ struct common_init_result common_init_from_params(common_params & params) { } } + char buf[1024]; + // load and optionally apply lora adapters for (auto & la : params.lora_adapters) { llama_adapter_lora_ptr lora; @@ -993,8 +995,10 @@ struct common_init_result common_init_from_params(common_params & params) { } la.ptr = lora.get(); - llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", la.task_name, sizeof(la.task_name)); - llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", la.prompt_prefix, sizeof(la.prompt_prefix)); + llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf)); + la.task_name = buf; + llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf)); + la.prompt_prefix = buf; iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters } diff --git a/common/common.h b/common/common.h index 42e1c29e3ba02..705295a80c1f2 100644 --- a/common/common.h +++ b/common/common.h @@ -31,8 +31,8 @@ struct common_adapter_lora_info { std::string path; float scale; - char task_name[64]; - char prompt_prefix[256]; + std::string task_name; + std::string prompt_prefix; struct llama_adapter_lora * ptr; };
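
Usage note (not part of the patch series): the sketch below shows how a client might read the adapter metadata exposed by the accessors added in [PATCH 12/13] and [PATCH 13/13]. It assumes the standard llama.h model/adapter loading entry points (llama_model_load_from_file, llama_adapter_lora_init); the GGUF file names are placeholders and the buffer sizes are arbitrary.

#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();

    // load the base embedding model and one task-specific LoRA GGUF (placeholder paths)
    llama_model * model = llama_model_load_from_file("jina-embeddings-v3.gguf", llama_model_default_params());
    llama_adapter_lora * adapter = model ? llama_adapter_lora_init(model, "lora-retrieval.query-jina-embeddings-v3.gguf") : nullptr;

    if (adapter != nullptr) {
        // enumerate every key/value pair stored in the adapter's GGUF metadata
        const int32_t n_kv = llama_adapter_meta_count(adapter);
        for (int32_t i = 0; i < n_kv; ++i) {
            char key[128];
            char val[256];
            llama_adapter_meta_key_by_index(adapter, i, key, sizeof(key));
            llama_adapter_meta_val_str_by_index(adapter, i, val, sizeof(val));
            printf("%s = %s\n", key, val);
        }

        // fetch the task prompt prefix that should be prepended to the input text
        char prefix[256];
        if (llama_adapter_meta_val_str(adapter, "adapter.lora.prompt_prefix", prefix, sizeof(prefix)) >= 0) {
            printf("prompt prefix: %s\n", prefix);
        }
    }

    llama_adapter_lora_free(adapter);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}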