Skip to content

Commit 948dfbf

Browse files
slaren and arthw
authored and committed
quantize : fix --keep-split (ggml-org#10114)
1 parent b5402b2 commit 948dfbf

File tree

1 file changed

+30
-23
lines changed

1 file changed

+30
-23
lines changed

src/llama.cpp

Lines changed: 30 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -4864,19 +4864,12 @@ struct llama_model_loader {
48644864
*last = 0;
48654865
*addr = mapping->addr;
48664866
for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
4867-
try {
4868-
const auto * weight = get_weight(ggml_get_name(tensor));
4869-
if (!weight) {
4870-
continue;
4871-
}
4872-
if (weight->idx != idx) {
4873-
continue;
4874-
}
4875-
*first = std::min(*first, weight->offs);
4876-
*last = std::max(*last, weight->offs + ggml_nbytes(tensor));
4877-
} catch(...) {
4878-
// the tensor is not in the model
4867+
const auto * weight = get_weight(ggml_get_name(tensor));
4868+
if (!weight || weight->idx != idx) {
4869+
continue;
48794870
}
4871+
*first = std::min(*first, weight->offs);
4872+
*last = std::max(*last, weight->offs + ggml_nbytes(tensor));
48804873
}
48814874
}
48824875

@@ -5053,7 +5046,6 @@ struct llama_model_loader {
50535046
ggml_backend_tensor_set(cur, data, 0, n_size);
50545047
}
50555048
} else {
5056-
GGML_ASSERT(weight->idx < files.size());
50575049
const auto & file = files.at(weight->idx);
50585050
if (ggml_backend_buffer_is_host(cur->buffer)) {
50595051
file->seek(weight->offs, SEEK_SET);
@@ -18631,8 +18623,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1863118623
}
1863218624
}
1863318625

18626+
// make a list of weights
18627+
std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
18628+
tensors.reserve(ml.weights_map.size());
1863418629
for (const auto & it : ml.weights_map) {
18635-
const struct ggml_tensor * tensor = it.second.tensor;
18630+
tensors.push_back(&it.second);
18631+
}
18632+
18633+
// keep_split requires that the weights are sorted by split index
18634+
if (params->keep_split) {
18635+
std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
18636+
if (a->idx == b->idx) {
18637+
return a->offs < b->offs;
18638+
}
18639+
return a->idx < b->idx;
18640+
});
18641+
}
18642+
18643+
for (const auto * it : tensors) {
18644+
const struct ggml_tensor * tensor = it->tensor;
1863618645

1863718646
const std::string name = ggml_get_name(tensor);
1863818647

@@ -18672,22 +18681,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1867218681
std::vector<no_init<float>> f32_conv_buf;
1867318682

1867418683
uint16_t n_split = 1;
18675-
const auto & weights_map = ml.weights_map;
1867618684

1867718685
// Assume split index is continuous
1867818686
if (params->keep_split) {
18679-
for (const auto & it : weights_map) {
18680-
n_split = std::max(uint16_t(it.second.idx + 1), n_split);
18687+
for (const auto * it : tensors) {
18688+
n_split = std::max(uint16_t(it->idx + 1), n_split);
1868118689
}
18682-
1868318690
}
1868418691
std::vector<gguf_context*> ctx_outs(n_split, NULL);
1868518692
ctx_outs[0] = ctx_out;
1868618693

1868718694
// populate the original tensors so we get an initial meta data
18688-
for (const auto & it : weights_map) {
18689-
uint16_t i_split = params->keep_split ? it.second.idx : 0;
18690-
struct ggml_tensor * tensor = it.second.tensor;
18695+
for (const auto * it : tensors) {
18696+
uint16_t i_split = params->keep_split ? it->idx : 0;
18697+
struct ggml_tensor * tensor = it->tensor;
1869118698
if (ctx_outs[i_split] == NULL) {
1869218699
ctx_outs[i_split] = gguf_init_empty();
1869318700
}
@@ -18734,8 +18741,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
1873418741

1873518742
const auto tn = LLM_TN(model.arch);
1873618743
new_ofstream(0);
18737-
for (const auto & it : weights_map) {
18738-
const auto & weight = it.second;
18744+
for (const auto * it : tensors) {
18745+
const auto & weight = *it;
1873918746
struct ggml_tensor * tensor = weight.tensor;
1874018747
if (weight.idx != cur_split && params->keep_split) {
1874118748
close_ofstream();

0 commit comments

Comments (0)