
Commit 208ca76

chore: sync llama.cpp @ ggml-org/llama.cpp#12288
1 parent 5dc97cf commit 208ca76

18 files changed: +57 -33 lines

ios/llama.cpp/.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -1379,7 +1379,7 @@ jobs:
         id: pack_artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
-          zip -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+          zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
 
       - name: Upload artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
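For context: zip dereferences symbolic links by default and stores copies of their targets, while --symlinks stores the links themselves. A macOS framework slice inside an xcframework relies on Versions/* symlinks, so archiving without the flag would unpack to a differently shaped bundle; that is the most plausible reading of this one-flag change.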

ios/llama.cpp/common/build-info.cpp

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-int LLAMA_BUILD_NUMBER = 4882;
-char const *LLAMA_COMMIT = "be7c3034";
+int LLAMA_BUILD_NUMBER = 4905;
+char const *LLAMA_COMMIT = "f74aee02";
 char const *LLAMA_COMPILER = "Apple clang version 16.0.0 (clang-1600.0.26.6)";
 char const *LLAMA_BUILD_TARGET = "arm64-apple-darwin24.2.0";

ios/llama.cpp/common/chat.cpp

Lines changed: 0 additions & 1 deletion
@@ -1414,7 +1414,6 @@ static common_chat_msg common_chat_parse_phi_4(const std::string & input) {
     return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
 }
 
-
 static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     // (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*

ios/llama.cpp/src/llama-context.cpp

Lines changed: 8 additions & 2 deletions
@@ -285,11 +285,15 @@ llama_context::llama_context(
 
     // reserve worst-case graph
     if (!hparams.vocab_only) {
-        uint32_t n_seqs = 1; // TODO: worst-case number of sequences
-        uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+        const uint32_t n_seqs = 1; // TODO: worst-case number of sequences
+        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
         llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
 
+        // restore later
+        // TODO: something cleaner
+        const auto n_outputs_save = n_outputs;
+
         // max number of outputs
         n_outputs = n_tokens;
 
@@ -341,6 +345,8 @@ llama_context::llama_context(
         }
     }
 
+    n_outputs = n_outputs_save;
+
     for (size_t i = 0; i < backend_ptrs.size(); ++i) {
         ggml_backend_t backend = backend_ptrs[i];
         ggml_backend_buffer_type_t buft = backend_buft[i];
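The added n_outputs_save / restore pair brackets the worst-case graph reservation: n_outputs is temporarily raised to n_tokens so the reserved graphs are sized for the maximum number of outputs, then put back once reservation is done. One shape the "something cleaner" from the TODO could take is a small RAII guard; the sketch below is illustrative only and is not code from this commit.

#include <cstdint>

// Illustrative RAII helper: remembers a value on entry, restores it on scope exit.
template <typename T>
struct scoped_restore {
    T & ref;
    T   saved;
    explicit scoped_restore(T & v) : ref(v), saved(v) {}
    ~scoped_restore() { ref = saved; }
};

int main() {
    int32_t n_outputs = 0;
    {
        scoped_restore<int32_t> guard(n_outputs); // saves 0
        n_outputs = 512;                          // pretend worst-case value while reserving graphs
        // ... reserve worst-case graphs here ...
    }                                             // n_outputs is 0 again at this point
    return static_cast<int>(n_outputs);
}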

ios/llama.cpp/src/llama-model.cpp

Lines changed: 4 additions & 1 deletion
@@ -1005,6 +1005,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 16: type = LLM_TYPE_1B; break;
                     case 32: type = LLM_TYPE_7B; break;
                     case 40: type = LLM_TYPE_13B; break;
+                    case 64: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -2726,6 +2727,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_OLMO2:
             {
+                const int64_t n_embd_head = n_embd / n_head;
+
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                 // output
@@ -2740,7 +2743,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                     layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
-                    layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
+                    layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
                     layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
 
                     layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
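The attn_k_norm change follows from grouped-query attention: the K projection has n_head_kv heads of size n_embd_head, so its norm weight needs n_head_kv * n_embd_head elements, which equals n_embd only when n_head_kv == n_head. A standalone sketch of the arithmetic, with made-up dimensions rather than values from any real OLMo-2 config:

#include <cassert>
#include <cstdint>

int main() {
    // Hypothetical GQA configuration (illustrative numbers, not from a model file).
    const int64_t n_embd    = 5120; // model hidden size
    const int64_t n_head    = 40;   // query heads
    const int64_t n_head_kv = 8;    // key/value heads

    const int64_t n_embd_head = n_embd / n_head;          // 128: per-head dimension
    const int64_t n_embd_gqa  = n_head_kv * n_embd_head;  // 1024: width of the K (and V) projection

    // The K-norm weight covers the K projection, not the full hidden state,
    // so sizing it as {n_embd} is only correct when n_head_kv == n_head.
    assert(n_embd_gqa == 1024);
    assert(n_embd_gqa != n_embd);
    return 0;
}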

ios/llama.cpp/tests/test-chat.cpp

Lines changed: 4 additions & 4 deletions
@@ -821,7 +821,7 @@ static void test_template_output_parsers() {
             "{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}");
     }
     {
-        auto tmpls = read_templates("models/templates/microsoft-Phi-4-mini-instruct.jinja");
+        auto tmpls = read_templates("models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja");
         std::vector<std::string> end_tokens{ "<|end|>" };
 
         assert_equals(COMMON_CHAT_FORMAT_PHI_4, common_chat_templates_apply(tmpls.get(), inputs_tools).format);
@@ -833,21 +833,21 @@ static void test_template_output_parsers() {
         assert_msg_equals(
             common_chat_msg{"assistant", "I'll help with that.", {}, tool_calls, "", "", ""},
             common_chat_parse(
-                "I'll help with that.<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}</|tool_call|>",
+                "I'll help with that.<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}<|/tool_call|>",
                 COMMON_CHAT_FORMAT_PHI_4));
 
         // Test with content after tool call
         assert_msg_equals(
             common_chat_msg{"assistant", "I'll help with that.", {}, tool_calls, "", "", ""},
             common_chat_parse(
-                "<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}</|tool_call|>I'll help with that.",
+                "<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}<|/tool_call|>I'll help with that.",
                 COMMON_CHAT_FORMAT_PHI_4));
 
         // Test with newlines.
         assert_msg_equals(message_assist_call, common_chat_parse(
             "<|tool_call|>\n"
             "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
-            "</|tool_call|>",
+            "<|/tool_call|>",
             COMMON_CHAT_FORMAT_PHI_4));
     }
     {
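The tests now expect the closing marker <|/tool_call|> (slash inside the delimiters) instead of </|tool_call|>. A minimal, self-contained way to see what the corrected strings look like to a parser; this uses a plain std::regex and is not the parser from chat.cpp:

#include <iostream>
#include <regex>
#include <string>

int main() {
    // One of the corrected test strings from above.
    const std::string output =
        "I'll help with that.<|tool_call|>"
        "{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}"
        "<|/tool_call|>";

    // Lazily match everything between the opening and closing markers.
    const std::regex call_re(R"(<\|tool_call\|>([\s\S]*?)<\|/tool_call\|>)");

    std::smatch m;
    if (std::regex_search(output, m, call_re)) {
        std::cout << "content:   " << output.substr(0, m.position(0)) << "\n";
        std::cout << "tool call: " << m[1] << "\n";
    }
    return 0;
}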

macos/llama.cpp/.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -1379,7 +1379,7 @@ jobs:
         id: pack_artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
-          zip -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+          zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
 
       - name: Upload artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

macos/llama.cpp/common/build-info.cpp

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-int LLAMA_BUILD_NUMBER = 4882;
-char const *LLAMA_COMMIT = "be7c3034";
+int LLAMA_BUILD_NUMBER = 4905;
+char const *LLAMA_COMMIT = "f74aee02";
 char const *LLAMA_COMPILER = "Apple clang version 16.0.0 (clang-1600.0.26.6)";
 char const *LLAMA_BUILD_TARGET = "arm64-apple-darwin24.2.0";

macos/llama.cpp/common/chat.cpp

Lines changed: 0 additions & 1 deletion
@@ -1414,7 +1414,6 @@ static common_chat_msg common_chat_parse_phi_4(const std::string & input) {
     return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
 }
 
-
 static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
    // (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*

macos/llama.cpp/src/llama-context.cpp

Lines changed: 8 additions & 2 deletions
@@ -285,11 +285,15 @@ llama_context::llama_context(
 
     // reserve worst-case graph
    if (!hparams.vocab_only) {
-        uint32_t n_seqs = 1; // TODO: worst-case number of sequences
-        uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+        const uint32_t n_seqs = 1; // TODO: worst-case number of sequences
+        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
         llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
 
+        // restore later
+        // TODO: something cleaner
+        const auto n_outputs_save = n_outputs;
+
         // max number of outputs
         n_outputs = n_tokens;
 
@@ -341,6 +345,8 @@ llama_context::llama_context(
         }
     }
 
+    n_outputs = n_outputs_save;
+
     for (size_t i = 0; i < backend_ptrs.size(); ++i) {
         ggml_backend_t backend = backend_ptrs[i];
         ggml_backend_buffer_type_t buft = backend_buft[i];
