Commit a4d430e

Merge branch 'spacestream' into FixSomeMess
2 parents 1600898 + d136bf0, commit a4d430e

35 files changed: +1447 additions, -788 deletions

common/common.cpp

Lines changed: 34 additions & 8 deletions
@@ -111,8 +111,34 @@ int32_t cpu_get_num_physical_cores() {
     if (result == 0) {
         return num_physical_cores;
     }
-#elif defined(_WIN32)
-    //TODO: Implement
+#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    unsigned int n_threads_win = std::thread::hardware_concurrency();
+    unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
+
+    DWORD buffer_size = 0;
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
+        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+            return default_threads;
+        }
+    }
+
+    std::vector<char> buffer(buffer_size);
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
+        return default_threads;
+    }
+
+    int32_t num_physical_cores = 0;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
+    while (buffer_size > 0) {
+        if (info->Relationship == RelationProcessorCore) {
+            num_physical_cores += info->Processor.GroupCount;
+        }
+        buffer_size -= info->Size;
+        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
+    }
+
+    return num_physical_cores > 0 ? num_physical_cores : default_threads;
 #endif
     unsigned int n_threads = std::thread::hardware_concurrency();
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
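For context on the Win32 call used above: GetLogicalProcessorInformationEx is queried twice, first with a null buffer to learn the required size (reported via ERROR_INSUFFICIENT_BUFFER), then with a real buffer whose variable-sized records are walked by advancing info->Size bytes at a time. The following minimal sketch is not part of the commit; it exercises the same pattern and counts one core per RelationProcessorCore record, whereas the patch sums info->Processor.GroupCount, which the Win32 documentation fixes at 1 for core records, so the two counts should agree.

#include <windows.h>

#include <cstdio>
#include <thread>
#include <vector>

// Minimal standalone sketch: physical vs. logical CPU counts on Windows 7+.
int main() {
    DWORD buffer_size = 0;
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size) &&
        GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
        return 1; // unexpected failure on the size query
    }

    std::vector<char> buffer(buffer_size);
    auto * info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, info, &buffer_size)) {
        return 1;
    }

    int physical_cores = 0;
    DWORD remaining = buffer_size;
    while (remaining > 0) {
        if (info->Relationship == RelationProcessorCore) {
            physical_cores += 1; // one record per physical core
        }
        // records are variable-sized, so step by info->Size rather than ++info
        remaining -= info->Size;
        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(
                reinterpret_cast<char *>(info) + info->Size);
    }

    printf("physical cores: %d, logical processors: %u\n",
           physical_cores, std::thread::hardware_concurrency());
    return 0;
}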
@@ -1760,7 +1786,13 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
     if (params.n_threads_batch != -1) {
         os << " (n_threads_batch = " << params.n_threads_batch << ")";
     }
+#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
+    os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
+#else
     os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
+#endif

     return os.str();
 }
@@ -2735,12 +2767,6 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
     return text;
 }

-bool llama_should_add_bos_token(const llama_model * model) {
-    const int add_bos = llama_add_bos_token(model);
-
-    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
-}
-
 //
 // Chat template utils
 //
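The deletion above (together with the matching declaration removed from common.h below) retires the llama_should_add_bos_token wrapper. The removed body special-cased a -1 "unknown" result from llama_add_bos_token; every call site in the example diffs further down now reads the accessor directly and asserts that automatic EOS insertion is off, which suggests the two accessors now report the flags as plain booleans. A hedged fragment of the call-site change (requires llama.h and ggml.h; `model` is a loaded llama_model pointer):

// before this commit:
//   const bool add_bos = llama_should_add_bos_token(model);
//   GGML_ASSERT(llama_add_eos_token(model) != 1);
//
// after this commit (as in the example diffs below):
const bool add_bos = llama_add_bos_token(model);   // read the add-BOS flag from the model metadata
GGML_ASSERT(!llama_add_eos_token(model));          // this code path expects no automatic EOS token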

common/common.h

Lines changed: 0 additions & 4 deletions
@@ -404,10 +404,6 @@ std::string llama_detokenize(
         const std::vector<llama_token> & tokens,
         bool special = true);

-// Uses the value from the model metadata if possible, otherwise
-// defaults to true when model type is SPM, otherwise false.
-bool llama_should_add_bos_token(const llama_model * model);
-
 //
 // Chat template utils
 //

convert_hf_to_gguf.py

Lines changed: 122 additions & 1 deletion
@@ -608,6 +608,15 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
             # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
             res = "smollm"
+        if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7":
+            # ref: https://huggingface.co/bigscience/bloom
+            res = "bloom"
+        if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
+            # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
+            res = "gpt3-finnish"
+        if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
+            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
+            res = "exaone"

         if res is None:
             logger.warning("\n")
@@ -911,7 +920,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return tensors


-@Model.register("BloomForCausalLM")
+@Model.register("BloomForCausalLM", "BloomModel")
 class BloomModel(Model):
     model_arch = gguf.MODEL_ARCH.BLOOM

@@ -3751,6 +3760,118 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         name = name.removeprefix("transformer.")
         return [(self.map_tensor_name(name), data_torch)]

+
+@Model.register("NemotronForCausalLM")
+class NemotronModel(Model):
+    model_arch = gguf.MODEL_ARCH.NEMOTRON
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+        self.gguf_writer.add_pad_token_id(0)
+        self.gguf_writer.add_unk_token_id(1)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"])
+        self.gguf_writer.add_layer_norm_eps(f_norm_eps)
+
+        # * Partial RoPE
+        rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+
+        # * RopeScaling for Nemotron
+        if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        else:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
+        # model.layers.{l}.input_layernorm.weight
+        # model.layers.{l}.post_attention_layernorm.weight
+        # model.norm.weight
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@Model.register("ExaoneForCausalLM")
+class ExaoneModel(Model):
+    model_arch = gguf.MODEL_ARCH.EXAONE
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+
+        assert(hparams["activation_function"] == "silu")
+
+        max_position_embeddings = hparams["max_position_embeddings"]
+        embed_dim = hparams["hidden_size"]
+        num_heads = hparams["num_attention_heads"]
+        num_kv_heads = hparams.get("num_key_value_heads", num_heads)
+        layer_norm_eps = hparams["layer_norm_epsilon"]
+        intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
+        num_layers = hparams["num_layers"]
+        # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0
+        # attention_dropout_rate = hparams["attention_dropout"]
+        # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
+        # embed_dropout_rate = hparams["embed_dropout"]
+        self.gguf_writer.add_embedding_length(embed_dim)
+        self.gguf_writer.add_head_count(num_heads)
+        self.gguf_writer.add_head_count_kv(num_kv_heads)
+        self.gguf_writer.add_context_length(max_position_embeddings)
+        self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_block_count(num_layers)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        if (rope_theta := self.hparams.get("rope_theta")) is not None:
+            self.gguf_writer.add_rope_freq_base(rope_theta)
+        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
+        rotary_factor = rotary_factor if rotary_factor is not None else 1.0
+        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
+        if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
+            if hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
+
+    def prepare_tensors(self):
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+
+        super().prepare_tensors()
+
 ###### CONVERSION LOGIC ######

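As a summary of the ExaoneModel.prepare_tensors loop above (the llama3-style RoPE scaling), the rope_freqs tensor it writes can be stated in closed form. The symbols are just names for the values the code reads: d is the per-head dimension, i indexes the d/2 rotary frequency pairs, L_old is original_max_position_embeddings, f is factor, and f_low / f_high are low_freq_factor / high_freq_factor.

\theta_i = \mathrm{base}^{-2i/d}, \qquad
\lambda_i = \frac{2\pi}{\theta_i}, \qquad
\lambda_{\mathrm{lo}} = \frac{L_{\mathrm{old}}}{f_{\mathrm{low}}}, \qquad
\lambda_{\mathrm{hi}} = \frac{L_{\mathrm{old}}}{f_{\mathrm{high}}}

s_i =
\begin{cases}
1 & \lambda_i < \lambda_{\mathrm{hi}} \\
f & \lambda_i > \lambda_{\mathrm{lo}} \\
\left( \dfrac{1-\mu_i}{f} + \mu_i \right)^{-1},
\quad \mu_i = \dfrac{L_{\mathrm{old}}/\lambda_i - f_{\mathrm{low}}}{f_{\mathrm{high}} - f_{\mathrm{low}}}
& \text{otherwise}
\end{cases}

The loop appends one s_i per frequency and stores the result as the ROPE_FREQS tensor, matching the piecewise branches in the code.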

convert_hf_to_gguf_update.py

Lines changed: 3 additions & 0 deletions
@@ -94,6 +94,9 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
     {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
     {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
+    {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
+    {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
+    {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
 ]

examples/cvector-generator/cvector-generator.cpp

Lines changed: 1 addition & 1 deletion
@@ -271,7 +271,7 @@ struct tokenized_prompt {
     size_t max_seq_len;

     tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
-        const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+        const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
         tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
         tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
         max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());

examples/eval-callback/eval-callback.cpp

Lines changed: 1 addition & 1 deletion
@@ -127,7 +127,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 }

 static bool run(llama_context * ctx, const gpt_params & params) {
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));

     std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);

examples/imatrix/imatrix.cpp

Lines changed: 2 additions & 2 deletions
@@ -435,8 +435,8 @@ static void process_logits(
 }

 static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
     const int n_ctx = llama_n_ctx(ctx);

     auto tim1 = std::chrono::high_resolution_clock::now();

examples/infill/infill.cpp

Lines changed: 2 additions & 2 deletions
@@ -204,8 +204,8 @@ int main(int argc, char ** argv) {
         LOG_TEE("\n");
         LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
     }
-    const bool add_bos = llama_should_add_bos_token(model);
-    GGML_ASSERT(llama_add_eos_token(model) != 1);
+    const bool add_bos = llama_add_bos_token(model);
+    GGML_ASSERT(!llama_add_eos_token(model));
     LOG("add_bos: %d\n", add_bos);

     std::vector<llama_token> embd_inp;

examples/main/main.cpp

Lines changed: 2 additions & 2 deletions
@@ -268,9 +268,9 @@ int main(int argc, char ** argv) {
         }
     }

-    const bool add_bos = llama_should_add_bos_token(model);
+    const bool add_bos = llama_add_bos_token(model);
     if (!llama_model_has_encoder(model)) {
-        GGML_ASSERT(llama_add_eos_token(model) != 1);
+        GGML_ASSERT(!llama_add_eos_token(model));
     }
     LOG("add_bos: %d\n", add_bos);

examples/perplexity/perplexity.cpp

Lines changed: 6 additions & 6 deletions
@@ -341,8 +341,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval

-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));

     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

@@ -481,8 +481,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval

-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));

     std::ofstream logits_stream;
     if (!params.logits_file.empty()) {
@@ -1734,8 +1734,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
     const int n_batch = params.n_batch;
     const int num_batches = (n_ctx + n_batch - 1)/n_batch;
     const int nv = 2*((n_vocab + 1)/2) + 4;
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));

     std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
     std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
