Commit 999a824

Latest commits (Vulkan needs fixes)
* see ggml-org/llama.cpp#14366 (comment)
1 parent 9d4e140 commit 999a824

94 files changed: +8705 additions, −3952 deletions


Makefile

Lines changed: 24 additions & 6 deletions
@@ -14,6 +14,9 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = chat chat_cl chat_vk
 
+GGML_VERSION = 0
+GGML_COMMIT = 0
+
 #CXX = g++
 #CXX = clang++
 
@@ -93,6 +96,8 @@ ifdef LLAMA_SHARED
 FLAG_S = shared
 endif
 
+VERSIONS = -DGGML_VERSION -DGGML_COMMIT
+
 ifdef AVX
 ARCH = -march=core-avx-i -mtune=core-avx-i -mavx
 ARCH_NAME = _AVX
@@ -117,7 +122,8 @@ PREFIX_A = master
 # ggml
 # ggmlsrc_f_h = $(base)/ggml
 # ggmlsrc_f_s = $(base)/ggml
-ggmlsrc_f = $(base)/$(PREFIX_A)/ggml
+ggmlsrc = $(base)/$(PREFIX_A)
+ggmlsrc_f = $(ggmlsrc)/ggml
 ggmlsrc_f_h = $(ggmlsrc_f)/include
 ggmlsrc_f_s = $(ggmlsrc_f)/src
 # backends
@@ -260,7 +266,7 @@ C_WARNS = -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Wer
 CPP_WARNS = -Wshadow -Wmissing-declarations -Wmissing-noreturn
 
 #for main ui
-CXXFLAGS_UI += $(OPT_UI) -std=$(CCPP) -fPIC -DNDEBUG $(ARCH) -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -w
+CXXFLAGS_UI += $(OPT_UI) -std=$(CCPP) -fPIC -DNDEBUG $(ARCH) $(VERSIONS) -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -w
 CXXFLAGS_UI += -I$(IMGUI_DIR) -I$(IMGUI_DIR)/backends
 CXXFLAGS_UI += -g -Wall -Wformat -pipe
 
@@ -269,10 +275,10 @@ CXXFLAGS_UI += -DSDL2
 endif
 
 #for general ggml-gguf
-CFLAGS = $(I_GGUF) $(OPTC) -std=$(CCC) -fPIC $(GNUPDATEC) -DNDEBUG $(ARCH) -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -w $(WARNS) $(C_WARNS) -pipe
+CFLAGS = $(I_GGUF) $(OPTC) -std=$(CCC) -fPIC $(GNUPDATEC) -DNDEBUG $(ARCH) $(VERSIONS) -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -w $(WARNS) $(C_WARNS) -pipe
 
 #for all chatTest
-CXXFLAGS = $(I_GGUF) $(OPT) -std=$(CCPP) $(GNUPDATECXX) -fPIC -DNDEBUG $(ARCH) -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -w $(WARNS) $(CPP_WARNS) -pipe
+CXXFLAGS = $(I_GGUF) $(OPT) -std=$(CCPP) $(GNUPDATECXX) -fPIC -DNDEBUG $(ARCH) $(VERSIONS) -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -w $(WARNS) $(CPP_WARNS) -pipe
 
 # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
@@ -839,6 +845,10 @@ chatTest_vk:
 	$(MAKE) chatTest VULKAN=1
 	@echo Using VULKAN
 
+test-vk-ops:
+	$(MAKE) test-backend-ops VULKAN=1
+	@echo Using VULKAN
+
 # aggregates
 
 chats: chat chat_ob chat_cl chat_vk
@@ -877,6 +887,10 @@ $(chatTest_cpu):$(TMP)$(PREFIX)_class_chat.o $(OBJS_GGUF)
 	@echo ARCH = $(ARCH)
 	$(CXX) $(I_GGUF) $(filter-out %.h,$^) $(LDFLAGS) -o $@ $(CXXFLAGS)
 
+test-backend-ops:$(ggmlsrc)/tests/test-backend-ops.cpp $(OBJS_GGUF)
+	@echo ARCH = $(ARCH)
+	$(CXX) $(I_GGUF) $(filter-out %.h,$^) $(LDFLAGS) -o $@ $(CXXFLAGS)
+
 dualTest:$(TMP)$(PREFIX)_dual_chat.o $(OBJS_GGUF)
 	$(CXX) $(I_GGUF) $(CXXFLAGS) $(filter-out %.h,$^) $(LDFLAGS) -o $@
 
@@ -894,7 +908,11 @@ $(EXE_VK)_mini:$(MAIN_CPP) llama_chat1.res $(OBJS) $(OBJS_VK)
 
 $(chatTest_vk):$(conapp) $(OBJS_VK)
 #$(CXX) $(I_GGUF) $(CXXFLAGS_VK) $(filter-out %.h,$^) $(LDFLAGS_VK) $(LDFLAGS_VK+) -o $@
-	$(CXX) $(I_GGUF) $(CXXFLAGS_VK) -c $< -o $(call GET_OBJ_FILE1, $<)
-	$(CXX) $(I_GGUF) $(CXXFLAGS_VK) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE1, $<) -o $@ $(LDFLAGS_VK) $(LDFLAGS_VK+)
+	$(CXX) $(I_GGUF) $(CXXFLAGS_VK) -c $< -o $(call GET_OBJ_FILE1, $<)
+	$(CXX) $(I_GGUF) $(CXXFLAGS_VK) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE1, $<) -o $@ $(LDFLAGS_VK) $(LDFLAGS_VK+)
 #-
 
+$(test-vk-ops):$(ggmlsrc)/tests/test-backend-ops.cpp $(OBJS_VK)
+	$(CXX) $(I_GGUF) $(CXXFLAGS_VK) -c $< -o $(call GET_OBJ_FILE1, $<)
+	$(CXX) $(I_GGUF) $(CXXFLAGS_VK) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE1, $<) -o $@ $(LDFLAGS_VK) $(LDFLAGS_VK+)
+
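
The only consumers of the new GGML_VERSION / GGML_COMMIT variables in this commit are the $(VERSIONS) compile flags; -DGGML_VERSION -DGGML_COMMIT simply define both macros (to 1 under GCC/Clang when no value is given). A minimal, hypothetical C++ consumer of those markers is sketched below; nothing in this commit shows the macros actually being read, so the function name and its use are assumptions for illustration only.

// Hypothetical consumer of the new build markers (illustration, not part of this commit).
#include <cstdio>

static void print_build_markers(void) {
#if defined(GGML_VERSION) && defined(GGML_COMMIT)
    std::printf("built with GGML version/commit markers\n");   // set via $(VERSIONS)
#else
    std::printf("built without GGML version/commit markers\n");
#endif
}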

base_sampling2/common.cpp

Lines changed: 5 additions & 0 deletions
@@ -1378,6 +1378,11 @@ std::vector<llama_token> common_tokenize(
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
     n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
+
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
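
For context, the new check guards the usual two-pass tokenization pattern: the first llama_tokenize call returns the required token count negated when the preallocated buffer is too small, and a return value of INT32_MIN cannot be safely negated for the resize. A simplified sketch of that flow, assuming the llama.cpp C API from llama.h, is below; the helper name tokenize_sketch is just for illustration.

// Simplified sketch of the two-pass pattern the overflow check protects.
#include <cstdint>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>
#include "llama.h"

std::vector<llama_token> tokenize_sketch(const llama_vocab * vocab, const std::string & text,
                                         bool add_special, bool parse_special) {
    std::vector<llama_token> result(text.length() + 2 * add_special);
    int n_tokens = llama_tokenize(vocab, text.data(), text.length(),
                                  result.data(), result.size(), add_special, parse_special);
    if (n_tokens == std::numeric_limits<int32_t>::min()) {
        // negating INT32_MIN would overflow, so the required size is unrepresentable
        throw std::runtime_error("Tokenization failed: input text too large");
    }
    if (n_tokens < 0) {
        // the first pass reported the required size as a negative count; retry with a larger buffer
        result.resize(-n_tokens);
        n_tokens = llama_tokenize(vocab, text.data(), text.length(),
                                  result.data(), result.size(), add_special, parse_special);
    }
    result.resize(n_tokens);
    return result;
}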

base_sampling2/llama-addon.cpp

Lines changed: 9 additions & 7 deletions
@@ -681,8 +681,7 @@ static void llama_sampler_min_p_addon_apply(struct llama_sampler * smpl, llama_t
         if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
             memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
             cur_p->size = filtered_tokens.size();
-            // Guard against a single choice
-            if (cur_p->size < 2) cur_p->size = 2;
+            // Cannot guard against a single choice here due to memcpy
             min_p_applied = true;
         }
     }
@@ -709,7 +708,7 @@ static void llama_sampler_min_p_addon_apply(struct llama_sampler * smpl, llama_t
         }
 
         // Guard against a single choice
-        if (i < 2) i = 2;
+        if (i < ctx->min_keep) i = ctx->min_keep;
 
         // Resize the output vector to keep only the matching tokens
         cur_p->size = i;
@@ -1123,13 +1122,15 @@ void llama_sample_p_step_addon_apply(struct llama_sampler * smpl, llama_token_da
             step_found = true;
         }
 
-        if (step_found && i >= ctx->min_keep) {
+        if (step_found) {
             // Resize the output vector to keep only the tokens before the step
-            candidates->size = i;
+            if (i > ctx->min_keep) candidates->size = i;
+            else candidates->size = ctx->min_keep;
 
             break;
         }
     }
+
     p_step_total = candidates->size;
 }
 
@@ -1661,8 +1662,9 @@ static void llama_sampler_tail_free_apply(struct llama_sampler * smpl, llama_tok
         cum_sum += second_derivatives[i];
 
         // Check if the running sum is greater than z or if we have kept at least min_keep tokens
-        if (cum_sum > ctx->z && i >= ctx->min_keep) {
-            last_idx = i;
+        if (cum_sum > ctx->z) {
+            if (i > ctx->min_keep) last_idx = i;
+            else last_idx = ctx->min_keep;
             break;
         }
     }
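
All four hunks in this file converge on one change: instead of hard-coding a floor of two candidates, the cutoff index is clamped up to ctx->min_keep before the candidate list is truncated, so a sampler never shrinks the pool below the caller-requested minimum. A standalone sketch of that clamp follows; token_array and truncate_with_min_keep are simplified stand-ins for illustration, not the real llama_token_data_array API.

// Standalone sketch of the min_keep clamp applied above.
#include <algorithm>
#include <cstddef>

struct token_array {
    size_t size;   // number of candidates currently kept (stand-in for llama_token_data_array)
};

// Truncate to `cutoff` candidates, but never drop below `min_keep`.
static void truncate_with_min_keep(token_array & cur, size_t cutoff, size_t min_keep) {
    cur.size = std::max(cutoff, min_keep);
}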

base_sampling2/master/ggml/include/ggml-backend.h

Lines changed: 1 addition & 1 deletion
@@ -339,7 +339,7 @@ extern "C" {
     typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
 
     // Compare the output of two backends
-    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);
 
     // Tensor initialization
     GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
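
A hedged usage sketch of the extended entry point: the callback type is the one declared just above, and test_node is the argument added here. Passing NULL for test_node to compare every node of the graph is an assumption about the intended semantics, not something the header states.

// Sketch only: a pass-through eval callback plus a call using the new signature.
// backend1, backend2 and graph are assumed to have been created elsewhere.
static bool eval_cb(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
    (void) node_index; (void) t1; (void) t2; (void) user_data;
    return true;   // return true to keep comparing subsequent nodes
}

// bool ok = ggml_backend_compare_graph_backend(backend1, backend2, graph,
//                                              eval_cb, /*user_data=*/NULL,
//                                              /*test_node=*/NULL);   // NULL assumed to mean "whole graph"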

base_sampling2/master/ggml/include/ggml-cpu.h

Lines changed: 2 additions & 0 deletions
@@ -101,6 +101,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_riscv_v    (void);
     GGML_BACKEND_API int ggml_cpu_has_vsx        (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe        (void);
+    GGML_BACKEND_API int ggml_cpu_has_nnpa       (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd  (void);
     GGML_BACKEND_API int ggml_cpu_has_llamafile  (void);
 
@@ -133,6 +134,7 @@ extern "C" {
 
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
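
A small sketch exercising the newly declared helper next to the existing ones; that ggml_cpu_fp32_to_fp32 is a plain same-type copy is inferred from its signature only, and the helper name convert_roundtrip is just for illustration.

// Sketch: pushing a buffer through the CPU conversion helpers declared above.
#include <stdint.h>
#include "ggml-cpu.h"

void convert_roundtrip(const float * src, float * dst32, ggml_fp16_t * tmp16, int64_t n) {
    ggml_cpu_fp32_to_fp32(src, dst32, n);     // same-type path (assumed to be a copy)
    ggml_cpu_fp32_to_fp16(src, tmp16, n);     // down-convert to fp16
    ggml_cpu_fp16_to_fp32(tmp16, dst32, n);   // back to fp32, overwriting dst32
}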
