
Commit 80d9d2a

Merge branch 'master' into compilade/batch-splits
2 parents: 1be5ea7 + b40eb84


58 files changed: 2,500 additions, 658 deletions

.devops/llama-cli-cann.Dockerfile

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
+
+FROM cosdt/cann:$ASCEND_VERSION AS build
+
+WORKDIR /app
+
+COPY . .
+
+RUN yum install -y gcc g++ cmake make
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
+ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
+ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
+ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
+
+# find libascend_hal.so, because the drive hasn`t been mounted.
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
+
+RUN echo "Building with static libs" && \
+    source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
+    cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
+    cmake --build build --config Release --target llama-cli
+
+# TODO: use image with NNRT
+FROM cosdt/cann:$ASCEND_VERSION AS runtime
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENV LC_ALL=C.utf8
+
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
+ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
+ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
+ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
+
+ENTRYPOINT ["/llama-cli" ]

.github/workflows/bench.yml renamed to .github/workflows/bench.yml.disabled

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
+# TODO: there have been some issues with the workflow, so disabling for now
+# https://github.com/ggerganov/llama.cpp/issues/7893
+#
 # Benchmark
 name: Benchmark
 

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -129,3 +129,6 @@ poetry.toml
 
 # Scripts
 !/scripts/install-oneapi.bat
+
+# Test models for lora adapters
+/lora-tests

README.md

Lines changed: 3 additions & 0 deletions
@@ -105,6 +105,8 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
 - [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
 - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
+- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
+- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
 
 (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
 
@@ -424,6 +426,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
 | [CUDA](./docs/build.md#cuda) | Nvidia GPU |
 | [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
 | [Vulkan](./docs/build.md#vulkan) | GPU |
+| [CANN](./docs/build.md#cann) | Ascend NPU |
 
 ## Tools

common/common.cpp

Lines changed: 34 additions & 8 deletions
@@ -110,8 +110,34 @@ int32_t cpu_get_num_physical_cores() {
     if (result == 0) {
         return num_physical_cores;
     }
-#elif defined(_WIN32)
-    //TODO: Implement
+#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    unsigned int n_threads_win = std::thread::hardware_concurrency();
+    unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
+
+    DWORD buffer_size = 0;
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
+        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+            return default_threads;
+        }
+    }
+
+    std::vector<char> buffer(buffer_size);
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
+        return default_threads;
+    }
+
+    int32_t num_physical_cores = 0;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
+    while (buffer_size > 0) {
+        if (info->Relationship == RelationProcessorCore) {
+            num_physical_cores += info->Processor.GroupCount;
+        }
+        buffer_size -= info->Size;
+        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
+    }
+
+    return num_physical_cores > 0 ? num_physical_cores : default_threads;
 #endif
     unsigned int n_threads = std::thread::hardware_concurrency();
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
@@ -1727,7 +1753,13 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
     if (params.n_threads_batch != -1) {
         os << " (n_threads_batch = " << params.n_threads_batch << ")";
     }
+#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
+    os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
+#else
     os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
+#endif
 
     return os.str();
 }
@@ -2702,12 +2734,6 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
     return text;
 }
 
-bool llama_should_add_bos_token(const llama_model * model) {
-    const int add_bos = llama_add_bos_token(model);
-
-    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
-}
-
 //
 // Chat template utils
 //
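
The new Windows branch applies the same default-thread heuristic the function already used for logical cores: prefer the physical-core count when it can be detected, otherwise fall back to the logical count (halved above 4 threads, defaulting to 4 when detection fails entirely). A minimal cross-platform sketch of that heuristic in Python, for illustration only; psutil is an assumption here and is not used by this commit:

```python
# Illustrative sketch of the thread-count heuristic above; not part of this commit.
# Assumes the third-party psutil package is installed.
import os

import psutil


def default_n_threads() -> int:
    physical = psutil.cpu_count(logical=False)        # physical cores, if detectable
    logical = psutil.cpu_count(logical=True) or os.cpu_count() or 0

    if physical:
        return physical                               # prefer physical cores
    if logical == 0:
        return 4                                      # detection failed entirely
    return logical if logical <= 4 else logical // 2  # same fallback as the hardware_concurrency path


if __name__ == "__main__":
    print(default_n_threads())
```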

common/common.h

Lines changed: 0 additions & 4 deletions
@@ -380,10 +380,6 @@ std::string llama_detokenize(
         const std::vector<llama_token> & tokens,
         bool special = true);
 
-// Uses the value from the model metadata if possible, otherwise
-// defaults to true when model type is SPM, otherwise false.
-bool llama_should_add_bos_token(const llama_model * model);
-
 //
 // Chat template utils
 //

convert_hf_to_gguf.py

Lines changed: 131 additions & 22 deletions
@@ -295,6 +295,7 @@ def prepare_tensors(self):
                         gguf.MODEL_TENSOR.FFN_GATE_INP,
                         gguf.MODEL_TENSOR.POS_EMBD,
                         gguf.MODEL_TENSOR.TOKEN_TYPES,
+                        gguf.MODEL_TENSOR.SSM_CONV1D,
                     )
                 )
                 or not name.endswith(".weight")
@@ -590,6 +591,15 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
             # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
             res = "smollm"
+        if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7":
+            # ref: https://huggingface.co/bigscience/bloom
+            res = "bloom"
+        if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
+            # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
+            res = "gpt3-finnish"
+        if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
+            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
+            res = "exaone"
 
         if res is None:
             logger.warning("\n")
@@ -893,7 +903,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return tensors
 
 
-@Model.register("BloomForCausalLM")
+@Model.register("BloomForCausalLM", "BloomModel")
 class BloomModel(Model):
     model_arch = gguf.MODEL_ARCH.BLOOM
 
@@ -2702,7 +2712,7 @@ class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2
 
 
-@Model.register("MambaForCausalLM", "MambaLMHeadModel")
+@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
     model_arch = gguf.MODEL_ARCH.MAMBA
 
@@ -2733,20 +2743,24 @@ def set_gguf_parameters(self):
         # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
         dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
         rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
-
+        use_dt_b_c_norm = False
+        # For falconmamba we do apply RMS norm on B / DT and C layers
+        if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",):
+            use_dt_b_c_norm = True
         # Fail early for models which don't have a block expansion factor of 2
         assert d_inner == 2 * d_model
 
         self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
         self.gguf_writer.add_embedding_length(d_model)
         self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
         self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
-        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_ssm_conv_kernel(d_conv)
         self.gguf_writer.add_ssm_inner_size(d_inner)
         self.gguf_writer.add_ssm_state_size(d_state)
         self.gguf_writer.add_ssm_time_step_rank(dt_rank)
         self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers
         self.gguf_writer.add_file_type(self.ftype)
 
     _tok_embd = None
@@ -2773,23 +2787,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         return [(new_name, data_torch)]
 
-    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
-        if bid is not None and new_name in (
-            self.format_tensor_name(
-                n, bid, ".weight" if name.endswith(".weight") else ""
-            )
-            for n in [
-                gguf.MODEL_TENSOR.SSM_CONV1D,
-                gguf.MODEL_TENSOR.SSM_X,
-                gguf.MODEL_TENSOR.SSM_DT,
-                gguf.MODEL_TENSOR.SSM_A,
-                gguf.MODEL_TENSOR.SSM_D,
-            ]
-        ):
-            return gguf.GGMLQuantizationType.F32
-
-        return super().tensor_force_quant(name, new_name, bid, n_dims)
-
 
 @Model.register("CohereForCausalLM")
 class CommandR2Model(Model):
@@ -3734,8 +3731,120 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         name = name.removeprefix("transformer.")
         return [(self.map_tensor_name(name), data_torch)]
 
-###### CONVERSION LOGIC ######
 
+@Model.register("NemotronForCausalLM")
+class NemotronModel(Model):
+    model_arch = gguf.MODEL_ARCH.NEMOTRON
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+        self.gguf_writer.add_pad_token_id(0)
+        self.gguf_writer.add_unk_token_id(1)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"])
+        self.gguf_writer.add_layer_norm_eps(f_norm_eps)
+
+        # * Partial RoPE
+        rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+
+        # * RopeScaling for Nemotron
+        if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        else:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
+        # model.layers.{l}.input_layernorm.weight
+        # model.layers.{l}.post_attention_layernorm.weight
+        # model.norm.weight
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("ExaoneForCausalLM")
+class ExaoneModel(Model):
+    model_arch = gguf.MODEL_ARCH.EXAONE
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+
+        assert (hparams["activation_function"] == "silu")
+
+        max_position_embeddings = hparams["max_position_embeddings"]
+        embed_dim = hparams["hidden_size"]
+        num_heads = hparams["num_attention_heads"]
+        num_kv_heads = hparams.get("num_key_value_heads", num_heads)
+        layer_norm_eps = hparams["layer_norm_epsilon"]
+        intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
+        num_layers = hparams["num_layers"]
+        # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0
+        # attention_dropout_rate = hparams["attention_dropout"]
+        # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
+        # embed_dropout_rate = hparams["embed_dropout"]
+        self.gguf_writer.add_embedding_length(embed_dim)
+        self.gguf_writer.add_head_count(num_heads)
+        self.gguf_writer.add_head_count_kv(num_kv_heads)
+        self.gguf_writer.add_context_length(max_position_embeddings)
+        self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_block_count(num_layers)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        if (rope_theta := self.hparams.get("rope_theta")) is not None:
+            self.gguf_writer.add_rope_freq_base(rope_theta)
+        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
+        rotary_factor = rotary_factor if rotary_factor is not None else 1.0
+        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
+        if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
+            if hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
+
+    def prepare_tensors(self):
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+
+        super().prepare_tensors()
+
+
+###### CONVERSION LOGIC ######
 
 # tree of lazy tensors
 class LazyTorchTensor(gguf.LazyBase):
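
The new `ExaoneModel.prepare_tensors` override above computes llama3-style rope frequency factors. Below is a standalone sketch of the same schedule, using numpy instead of torch so it runs on its own; the function name, parameter defaults (taken from the diff), and the `head_dim=128` usage value are illustrative choices, not part of the commit:

```python
# Standalone sketch of the llama3-style rope-factor schedule used in
# ExaoneModel.prepare_tensors above; illustrative only.
import math

import numpy as np


def llama3_rope_factors(head_dim: int, base: float = 10000.0, factor: float = 8.0,
                        low_freq_factor: float = 1.0, high_freq_factor: float = 4.0,
                        old_context_len: int = 8192) -> np.ndarray:
    # per-dimension rotary frequencies, matching torch.arange(0, dim, 2) / dim
    freqs = 1.0 / (base ** (np.arange(0, head_dim, 2, dtype=np.float32) / head_dim))

    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor

    rope_factors = []
    for freq in freqs:
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:
            rope_factors.append(1.0)        # short wavelengths are left untouched
        elif wavelen > low_freq_wavelen:
            rope_factors.append(factor)     # long wavelengths are fully scaled
        else:
            # smooth interpolation between the two regimes
            smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
            rope_factors.append(1.0 / ((1.0 - smooth) / factor + smooth))

    return np.array(rope_factors, dtype=np.float32)


if __name__ == "__main__":
    # e.g. hidden_size=4096, num_attention_heads=32 -> head_dim=128
    print(llama3_rope_factors(head_dim=128)[:8])
```

Each per-dimension factor is 1 for short wavelengths, `factor` for long wavelengths, and smoothly interpolated in between; the resulting array is what the diff writes into the `ROPE_FREQS` tensor.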

convert_hf_to_gguf_update.py

Lines changed: 3 additions & 0 deletions
@@ -94,6 +94,9 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
     {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
     {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
+    {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
+    {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
+    {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
 ]
 
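
These three entries feed the same fingerprinting flow as the existing ones: a fixed check string is tokenized with each listed tokenizer and the resulting token ids are hashed, and `get_vocab_base_pre()` in convert_hf_to_gguf.py matches that hash to a `res` name. A hedged sketch of the idea; the check text below is a placeholder, not the string the scripts actually use, and transformers is assumed to be installed:

```python
# Sketch of the pre-tokenizer fingerprinting behind the chkhsh values above;
# the check text is a placeholder.
from hashlib import sha256

from transformers import AutoTokenizer


def pre_tokenizer_fingerprint(repo: str, check_text: str) -> str:
    tokenizer = AutoTokenizer.from_pretrained(repo)
    token_ids = tokenizer.encode(check_text)            # pre-tokenizer differences change these ids
    return sha256(str(token_ids).encode()).hexdigest()  # stable fingerprint of the id sequence


if __name__ == "__main__":
    print(pre_tokenizer_fingerprint("TurkuNLP/gpt3-finnish-small", "Hello, world! 123 éè"))
```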

convert_llama_ggml_to_gguf.py

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ def load(self, data, offset):
         assert quant is not None, 'Unknown tensor type'
         (blksize, tysize) = quant
         offset += 12
-        self.dtype= dtype
+        self.dtype= gguf.GGMLQuantizationType(dtype)
         self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
         offset += 4 * n_dims
         self.name = bytes(data[offset:offset + name_len])
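
The one-line fix wraps the raw dtype integer in `gguf.GGMLQuantizationType`, so downstream code gets a named, validated enum member rather than a bare int. A small sketch of the difference, assuming the gguf Python package (gguf-py) is importable; the value 1 standing for F16 follows standard GGML type numbering and is used here as an assumption:

```python
# Sketch: effect of wrapping the raw dtype integer in the enum.
import gguf

raw_dtype = 1                                 # integer read from the GGML tensor header
dtype = gguf.GGMLQuantizationType(raw_dtype)  # raises ValueError for unknown type ids

print(dtype, dtype.name)                      # readable name instead of a bare int
assert dtype == raw_dtype                     # IntEnum still compares equal to the raw value
```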
