
Commit 7d91690

Merge branch 'master' into croco_nex_0
2 parents: 0395eb5 + bd38665


59 files changed: +194623 −192222 lines

Makefile

Lines changed: 2 additions & 1 deletion
@@ -719,11 +719,12 @@ cleanocuda:
 # useful tools
 main: examples/main/main.cpp common/json-schema-to-grammar.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-	@echo '==== Run ./main -h for help. ===='
 sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o ggml-cpu.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 whispermain: otherarch/whispercpp/main.cpp otherarch/whispercpp/whisper.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+ttsmain: examples/tts/tts.cpp common/json-schema-to-grammar.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 gguf-split: examples/gguf-split/gguf-split.cpp ggml.o ggml-cpu.o llama.o build-info.h llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 

README.md

Lines changed: 7 additions & 1 deletion
@@ -404,4 +404,10 @@ when you can't use the precompiled binary directly, we provide an automated buil
 - [Stable Diffusion 1.5 and SDXL safetensor models](https://github.com/LostRuins/koboldcpp/wiki#can-i-generate-images-with-koboldcpp)
 - [LLaVA based Vision models and multimodal projectors (mmproj)](https://github.com/LostRuins/koboldcpp/wiki#what-is-llava-and-mmproj)
 - [Whisper models for Speech-To-Text](https://huggingface.co/koboldcpp/whisper/tree/main)
-
+
+# Where can I download AI model files?
+- The best place to get GGUF text models is huggingface. For image models, CivitAI has a good selection. Here are some to get started.
+- Text Generation: [BookAdventures 8B](https://huggingface.co/KoboldAI/Llama-3.1-8B-BookAdventures-GGUF/resolve/main/Llama-3.1-8B-BookAdventures.Q4_K_S.gguf) or [Tiefighter 13B](https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf) (larger model).
+- Image Generation: [Anything v3](https://huggingface.co/admruul/anything-v3.0/resolve/main/Anything-V3.0-pruned-fp16.safetensors) or [Deliberate V2](https://huggingface.co/Yntec/Deliberate2/resolve/main/Deliberate_v2.safetensors) or [Dreamshaper SDXL](https://huggingface.co/Lykon/dreamshaper-xl-v2-turbo/resolve/main/DreamShaperXL_Turbo_v2_1.safetensors)
+- Image Recognition MMproj: [Pick the correct one for your model architecture here](https://huggingface.co/koboldcpp/mmproj/tree/main)
+- Speech Recognition: [Whisper models for Speech-To-Text](https://huggingface.co/koboldcpp/whisper/tree/main)
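As a quick usage sketch of the README addition above (standard-library Python only; the URL is one of the links listed in the diff, and the local filename is an arbitrary choice), one of these GGUF files can be fetched directly:

    # Download one of the GGUF models listed above using only the
    # Python standard library.
    import urllib.request

    url = ("https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF"
           "/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf")
    urllib.request.urlretrieve(url, "LLaMA2-13B-Tiefighter.Q4_K_S.gguf")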

convert_hf_to_gguf.py

Lines changed: 134 additions & 4 deletions
@@ -326,6 +326,7 @@ def prepare_tensors(self):
                         gguf.MODEL_TENSOR.TIME_MIX_W2,
                         gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
                         gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
+                        gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
                         gguf.MODEL_TENSOR.POSNET_NORM1,
                         gguf.MODEL_TENSOR.POSNET_NORM2,
                     )
@@ -2562,6 +2563,63 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
 
 
+@Model.register("PhiMoEForCausalLM")
+class PhiMoeModel(Phi3MiniModel):
+    model_arch = gguf.MODEL_ARCH.PHIMOE
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
+        self.gguf_writer.add_expert_count(self.hparams["num_local_experts"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("block_sparse_moe.experts") != -1:
+            n_experts = self.hparams["num_local_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["w1", "w2", "w3"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @Model.register("PlamoForCausalLM")
 class PlamoModel(Model):
     model_arch = gguf.MODEL_ARCH.PLAMO
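A note on the expert merge in PhiMoeModel.modify_tensors above: once a layer has collected all n_experts * 3 tensors, each weight name is stacked into a single 3D tensor. A minimal standalone sketch with made-up dimensions (n_experts, n_ff, and n_embd are placeholders, not values from any PhiMoE config):

    import torch

    n_experts, n_ff, n_embd = 16, 6400, 4096       # hypothetical sizes
    # one 2D weight per expert, as collected in self._experts[bid]
    w1_per_expert = [torch.randn(n_ff, n_embd) for _ in range(n_experts)]

    # same stacking as the converter: dim=0 becomes the expert axis
    merged = torch.stack(w1_per_expert, dim=0)
    print(merged.shape)                            # torch.Size([16, 6400, 4096])

Each of w1/w2/w3 thus leaves the converter as one (n_experts, rows, cols) tensor per layer, which is why modify_tensors returns an empty list until a layer's full set of expert tensors has arrived.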
@@ -3259,6 +3317,8 @@ def set_gguf_parameters(self):
         # required by llama.cpp, unused
         self.gguf_writer.add_head_count(0)
 
+    lerp_weights: dict[int, dict[str, Tensor]] = {}
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         new_name = self.map_tensor_name(name)
 
@@ -3274,14 +3334,84 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
             data_torch = data_torch.squeeze()
 
-        rescale_every_n_layers = self.hparams["rescale_every"]
-        if rescale_every_n_layers > 0:
-            if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
-                data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+        try:
+            rescale_every_n_layers = self.hparams["rescale_every"]
+            if rescale_every_n_layers > 0:
+                if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
+                    data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+        except KeyError:
+            pass
+
+        # concat time_mix_lerp weights to reduce some cpu overhead
+        # also reduces the number of tensors in the model
+        if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name:
+            try:
+                self.lerp_weights[bid][new_name] = data_torch
+            except KeyError:
+                self.lerp_weights[bid] = {new_name: data_torch}
+            if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]):
+                new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
+                data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
+                yield (new_name, data)
+            return
 
         yield (new_name, data_torch)
 
 
+@Model.register("RWKV6Qwen2ForCausalLM")
+class RWKV6Qwen2Model(Rwkv6Model):
+    model_arch = gguf.MODEL_ARCH.RWKV6QWEN2
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        num_attention_heads = self.hparams["num_attention_heads"]
+        num_key_value_heads = self.hparams["num_key_value_heads"]
+        hidden_size = self.hparams["hidden_size"]
+        head_size = hidden_size // num_attention_heads
+        rms_norm_eps = self.hparams["rms_norm_eps"]
+        intermediate_size = self.hparams["intermediate_size"]
+        time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # special parameters for time_mixing in RWKV6QWEN2
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_token_shift_count(1)
+        # RWKV6QWEN2 use grouped key/value like GQA
+        self.gguf_writer.add_head_count_kv(num_key_value_heads)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        for new_name, data in super().modify_tensors(data_torch, name, bid):
+            if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
+                data = data.view(5, -1, data.shape[-1])
+                # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg
+                # permute them here to avoid code changes
+                data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
+                if "w2" in new_name:
+                    data = data.view(5, -1, data.shape[-1])
+                yield (new_name, data)
+                continue
+            yield (new_name, data)
+
+
 @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
     model_arch = gguf.MODEL_ARCH.MAMBA
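To make the time_mix_lerp fusion in Rwkv6Model.modify_tensors easier to follow, here is a shape walk-through as a standalone sketch (n_embd is a placeholder width, not a value from any checkpoint): each squeezed lerp weight is a 1D (n_embd,) vector; unsqueeze(0) turns it into (1, n_embd); stacking w, k, v, r, g on dim=0 gives (5, 1, n_embd); the final unsqueeze(1) produces the fused (5, 1, 1, n_embd) tensor.

    import torch

    n_embd = 2048                                  # hypothetical embedding width
    lerps = {i: torch.randn(n_embd) for i in ["w", "k", "v", "r", "g"]}

    # same chain of ops as the converter code above
    fused = torch.stack(
        [lerps[i].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0
    ).unsqueeze(1)
    print(fused.shape)                             # torch.Size([5, 1, 1, 2048])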
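Similarly, the index pattern [3, 1, 2, 0, 4] in RWKV6Qwen2Model.modify_tensors is a fixed permutation from the checkpoint's rkvwg section order to the wkvrg order the rest of the code expects. A small sketch with labeled dummy rows instead of real weights makes the reorder visible:

    import torch

    # pretend each section of time_mix_w1 is a single row, stored as r,k,v,w,g
    order_in = ["r", "k", "v", "w", "g"]
    data = torch.arange(5).view(5, 1, 1)           # row i stands for order_in[i]

    # same index pattern as the converter: select w, k, v, r, g
    perm = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0)
    print([order_in[int(i)] for i in perm.view(-1)])   # ['w', 'k', 'v', 'r', 'g']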

cudart64_12.dll

25.5 KB · Binary file not shown.
