
Commit 208ca76

chore: sync llama.cpp @ ggml-org/llama.cpp#12288
1 parent 5dc97cf commit 208ca76

18 files changed: +57 -33 lines

ios/llama.cpp/.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -1379,7 +1379,7 @@ jobs:
         id: pack_artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
-          zip -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+          zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
 
       - name: Upload artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
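For context: zip dereferences symbolic links by default and stores copies of their targets, while --symlinks stores the links themselves. A macOS framework slice inside an xcframework relies on Versions/* symlinks, so archiving without the flag would unpack to a differently shaped bundle; that is the most plausible reading of this one-flag change.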

ios/llama.cpp/common/build-info.cpp

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-int LLAMA_BUILD_NUMBER = 4882;
-char const *LLAMA_COMMIT = "be7c3034";
+int LLAMA_BUILD_NUMBER = 4905;
+char const *LLAMA_COMMIT = "f74aee02";
 char const *LLAMA_COMPILER = "Apple clang version 16.0.0 (clang-1600.0.26.6)";
 char const *LLAMA_BUILD_TARGET = "arm64-apple-darwin24.2.0";

ios/llama.cpp/common/chat.cpp

Lines changed: 0 additions & 1 deletion
@@ -1414,7 +1414,6 @@ static common_chat_msg common_chat_parse_phi_4(const std::string & input) {
     return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
 }
 
-
 static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     // (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*

ios/llama.cpp/src/llama-context.cpp

Lines changed: 8 additions & 2 deletions
@@ -285,11 +285,15 @@ llama_context::llama_context(
 
     // reserve worst-case graph
     if (!hparams.vocab_only) {
-        uint32_t n_seqs = 1; // TODO: worst-case number of sequences
-        uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+        const uint32_t n_seqs = 1; // TODO: worst-case number of sequences
+        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
         llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
 
+        // restore later
+        // TODO: something cleaner
+        const auto n_outputs_save = n_outputs;
+
         // max number of outputs
         n_outputs = n_tokens;
 
@@ -341,6 +345,8 @@ llama_context::llama_context(
         }
     }
 
+    n_outputs = n_outputs_save;
+
     for (size_t i = 0; i < backend_ptrs.size(); ++i) {
         ggml_backend_t backend = backend_ptrs[i];
         ggml_backend_buffer_type_t buft = backend_buft[i];
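The added n_outputs_save / restore pair brackets the worst-case graph reservation: n_outputs is temporarily raised to n_tokens so the reserved graphs are sized for the maximum number of outputs, then put back once reservation is done. One shape the "something cleaner" from the TODO could take is a small RAII guard; the sketch below is illustrative only and is not code from this commit.

#include <cstdint>

// Illustrative RAII helper: remembers a value on entry, restores it on scope exit.
template <typename T>
struct scoped_restore {
    T & ref;
    T   saved;
    explicit scoped_restore(T & v) : ref(v), saved(v) {}
    ~scoped_restore() { ref = saved; }
};

int main() {
    int32_t n_outputs = 0;
    {
        scoped_restore<int32_t> guard(n_outputs); // saves 0
        n_outputs = 512;                          // pretend worst-case value while reserving graphs
        // ... reserve worst-case graphs here ...
    }                                             // n_outputs is 0 again at this point
    return static_cast<int>(n_outputs);
}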

ios/llama.cpp/src/llama-model.cpp

Lines changed: 4 additions & 1 deletion
@@ -1005,6 +1005,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 16: type = LLM_TYPE_1B; break;
                     case 32: type = LLM_TYPE_7B; break;
                     case 40: type = LLM_TYPE_13B; break;
+                    case 64: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -2726,6 +2727,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_OLMO2:
             {
+                const int64_t n_embd_head = n_embd / n_head;
+
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                 // output
@@ -2740,7 +2743,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
                     layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
-                    layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
+                    layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0);
                     layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
 
                     layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
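The attn_k_norm change follows from grouped-query attention: the K projection has n_head_kv heads of size n_embd_head, so its norm weight needs n_head_kv * n_embd_head elements, which equals n_embd only when n_head_kv == n_head. A standalone sketch of the arithmetic, with made-up dimensions rather than values from any real OLMo-2 config:

#include <cassert>
#include <cstdint>

int main() {
    // Hypothetical GQA configuration (illustrative numbers, not from a model file).
    const int64_t n_embd    = 5120; // model hidden size
    const int64_t n_head    = 40;   // query heads
    const int64_t n_head_kv = 8;    // key/value heads

    const int64_t n_embd_head = n_embd / n_head;          // 128: per-head dimension
    const int64_t n_embd_gqa  = n_head_kv * n_embd_head;  // 1024: width of the K (and V) projection

    // The K-norm weight covers the K projection, not the full hidden state,
    // so sizing it as {n_embd} is only correct when n_head_kv == n_head.
    assert(n_embd_gqa == 1024);
    assert(n_embd_gqa != n_embd);
    return 0;
}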

ios/llama.cpp/tests/test-chat.cpp

Lines changed: 4 additions & 4 deletions
@@ -821,7 +821,7 @@ static void test_template_output_parsers() {
             "{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}");
     }
     {
-        auto tmpls = read_templates("models/templates/microsoft-Phi-4-mini-instruct.jinja");
+        auto tmpls = read_templates("models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja");
         std::vector<std::string> end_tokens{ "<|end|>" };
 
         assert_equals(COMMON_CHAT_FORMAT_PHI_4, common_chat_templates_apply(tmpls.get(), inputs_tools).format);
@@ -833,21 +833,21 @@ static void test_template_output_parsers() {
         assert_msg_equals(
             common_chat_msg{"assistant", "I'll help with that.", {}, tool_calls, "", "", ""},
             common_chat_parse(
-                "I'll help with that.<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}</|tool_call|>",
+                "I'll help with that.<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}<|/tool_call|>",
                 COMMON_CHAT_FORMAT_PHI_4));
 
         // Test with content after tool call
         assert_msg_equals(
             common_chat_msg{"assistant", "I'll help with that.", {}, tool_calls, "", "", ""},
             common_chat_parse(
-                "<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}</|tool_call|>I'll help with that.",
+                "<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}<|/tool_call|>I'll help with that.",
                 COMMON_CHAT_FORMAT_PHI_4));
 
         // Test with newlines.
         assert_msg_equals(message_assist_call, common_chat_parse(
             "<|tool_call|>\n"
             "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
-            "</|tool_call|>",
+            "<|/tool_call|>",
             COMMON_CHAT_FORMAT_PHI_4));
     }
     {
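The tests now expect the closing marker <|/tool_call|> (slash inside the delimiters) instead of </|tool_call|>. A minimal, self-contained way to see what the corrected strings look like to a parser; this uses a plain std::regex and is not the parser from chat.cpp:

#include <iostream>
#include <regex>
#include <string>

int main() {
    // One of the corrected test strings from above.
    const std::string output =
        "I'll help with that.<|tool_call|>"
        "{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}"
        "<|/tool_call|>";

    // Lazily match everything between the opening and closing markers.
    const std::regex call_re(R"(<\|tool_call\|>([\s\S]*?)<\|/tool_call\|>)");

    std::smatch m;
    if (std::regex_search(output, m, call_re)) {
        std::cout << "content:   " << output.substr(0, m.position(0)) << "\n";
        std::cout << "tool call: " << m[1] << "\n";
    }
    return 0;
}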

macos/llama.cpp/.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -1379,7 +1379,7 @@ jobs:
         id: pack_artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
-          zip -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+          zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
 
       - name: Upload artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

macos/llama.cpp/common/build-info.cpp

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-int LLAMA_BUILD_NUMBER = 4882;
-char const *LLAMA_COMMIT = "be7c3034";
+int LLAMA_BUILD_NUMBER = 4905;
+char const *LLAMA_COMMIT = "f74aee02";
 char const *LLAMA_COMPILER = "Apple clang version 16.0.0 (clang-1600.0.26.6)";
 char const *LLAMA_BUILD_TARGET = "arm64-apple-darwin24.2.0";

macos/llama.cpp/common/chat.cpp

Lines changed: 0 additions & 1 deletion
@@ -1414,7 +1414,6 @@ static common_chat_msg common_chat_parse_phi_4(const std::string & input) {
     return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
 }
 
-
 static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
    // (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*

macos/llama.cpp/src/llama-context.cpp

Lines changed: 8 additions & 2 deletions
@@ -285,11 +285,15 @@ llama_context::llama_context(
 
     // reserve worst-case graph
    if (!hparams.vocab_only) {
-        uint32_t n_seqs = 1; // TODO: worst-case number of sequences
-        uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+        const uint32_t n_seqs = 1; // TODO: worst-case number of sequences
+        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
         llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
 
+        // restore later
+        // TODO: something cleaner
+        const auto n_outputs_save = n_outputs;
+
         // max number of outputs
         n_outputs = n_tokens;
 
@@ -341,6 +345,8 @@ llama_context::llama_context(
         }
     }
 
+    n_outputs = n_outputs_save;
+
     for (size_t i = 0; i < backend_ptrs.size(); ++i) {
         ggml_backend_t backend = backend_ptrs[i];
         ggml_backend_buffer_type_t buft = backend_buft[i];
