Commit 999a824

Latest commits (Vulkan needs fixes)
* see ggml-org/llama.cpp#14366 (comment)
1 parent 9d4e140 commit 999a824

94 files changed: +8705 additions, −3952 deletions


Makefile

Lines changed: 24 additions & 6 deletions
@@ -14,6 +14,9 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = chat chat_cl chat_vk
 
+GGML_VERSION = 0
+GGML_COMMIT = 0
+
 #CXX = g++
 #CXX = clang++
 
@@ -93,6 +96,8 @@ ifdef LLAMA_SHARED
 FLAG_S = shared
 endif
 
+VERSIONS = -DGGML_VERSION -DGGML_COMMIT
+
 ifdef AVX
 ARCH = -march=core-avx-i -mtune=core-avx-i -mavx
 ARCH_NAME = _AVX
@@ -117,7 +122,8 @@ PREFIX_A = master
 # ggml
 # ggmlsrc_f_h = $(base)/ggml
 # ggmlsrc_f_s = $(base)/ggml
-ggmlsrc_f = $(base)/$(PREFIX_A)/ggml
+ggmlsrc = $(base)/$(PREFIX_A)
+ggmlsrc_f = $(ggmlsrc)/ggml
 ggmlsrc_f_h = $(ggmlsrc_f)/include
 ggmlsrc_f_s = $(ggmlsrc_f)/src
 # backends
@@ -260,7 +266,7 @@ C_WARNS = -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Wer
 CPP_WARNS = -Wshadow -Wmissing-declarations -Wmissing-noreturn
 
 #for main ui
-CXXFLAGS_UI += $(OPT_UI) -std=$(CCPP) -fPIC -DNDEBUG $(ARCH) -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -w
+CXXFLAGS_UI += $(OPT_UI) -std=$(CCPP) -fPIC -DNDEBUG $(ARCH) $(VERSIONS) -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -w
 CXXFLAGS_UI += -I$(IMGUI_DIR) -I$(IMGUI_DIR)/backends
 CXXFLAGS_UI += -g -Wall -Wformat -pipe
 
@@ -269,10 +275,10 @@ CXXFLAGS_UI += -DSDL2
 endif
 
 #for general ggml-gguf
-CFLAGS = $(I_GGUF) $(OPTC) -std=$(CCC) -fPIC $(GNUPDATEC) -DNDEBUG $(ARCH) -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -w $(WARNS) $(C_WARNS) -pipe
+CFLAGS = $(I_GGUF) $(OPTC) -std=$(CCC) -fPIC $(GNUPDATEC) -DNDEBUG $(ARCH) $(VERSIONS) -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -w $(WARNS) $(C_WARNS) -pipe
 
 #for all chatTest
-CXXFLAGS = $(I_GGUF) $(OPT) -std=$(CCPP) $(GNUPDATECXX) -fPIC -DNDEBUG $(ARCH) -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -w $(WARNS) $(CPP_WARNS) -pipe
+CXXFLAGS = $(I_GGUF) $(OPT) -std=$(CCPP) $(GNUPDATECXX) -fPIC -DNDEBUG $(ARCH) $(VERSIONS) -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -w $(WARNS) $(CPP_WARNS) -pipe
 
 # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
@@ -839,6 +845,10 @@ chatTest_vk:
 	$(MAKE) chatTest VULKAN=1
 	@echo Using VULKAN
 
+test-vk-ops:
+	$(MAKE) test-backend-ops VULKAN=1
+	@echo Using VULKAN
+
 # aggregates
 
 chats: chat chat_ob chat_cl chat_vk
@@ -877,6 +887,10 @@ $(chatTest_cpu):$(TMP)$(PREFIX)_class_chat.o $(OBJS_GGUF)
 	@echo ARCH = $(ARCH)
 	$(CXX) $(I_GGUF) $(filter-out %.h,$^) $(LDFLAGS) -o $@ $(CXXFLAGS)
 
+test-backend-ops:$(ggmlsrc)/tests/test-backend-ops.cpp $(OBJS_GGUF)
+	@echo ARCH = $(ARCH)
+	$(CXX) $(I_GGUF) $(filter-out %.h,$^) $(LDFLAGS) -o $@ $(CXXFLAGS)
+
 dualTest:$(TMP)$(PREFIX)_dual_chat.o $(OBJS_GGUF)
 	$(CXX) $(I_GGUF) $(CXXFLAGS) $(filter-out %.h,$^) $(LDFLAGS) -o $@
 
@@ -894,7 +908,11 @@ $(EXE_VK)_mini:$(MAIN_CPP) llama_chat1.res $(OBJS) $(OBJS_VK)
 
 $(chatTest_vk):$(conapp) $(OBJS_VK)
 #$(CXX) $(I_GGUF) $(CXXFLAGS_VK) $(filter-out %.h,$^) $(LDFLAGS_VK) $(LDFLAGS_VK+) -o $@
-	$(CXX) $(I_GGUF) $(CXXFLAGS_VK) -c $< -o $(call GET_OBJ_FILE1, $<)
-	$(CXX) $(I_GGUF) $(CXXFLAGS_VK) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE1, $<) -o $@ $(LDFLAGS_VK) $(LDFLAGS_VK+)
+	$(CXX) $(I_GGUF) $(CXXFLAGS_VK) -c $< -o $(call GET_OBJ_FILE1, $<)
+	$(CXX) $(I_GGUF) $(CXXFLAGS_VK) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE1, $<) -o $@ $(LDFLAGS_VK) $(LDFLAGS_VK+)
 #-
 
+$(test-vk-ops):$(ggmlsrc)/tests/test-backend-ops.cpp $(OBJS_VK)
+	$(CXX) $(I_GGUF) $(CXXFLAGS_VK) -c $< -o $(call GET_OBJ_FILE1, $<)
+	$(CXX) $(I_GGUF) $(CXXFLAGS_VK) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE1, $<) -o $@ $(LDFLAGS_VK) $(LDFLAGS_VK+)
+
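
The only consumers of the new GGML_VERSION / GGML_COMMIT variables in this commit are the $(VERSIONS) compile flags; -DGGML_VERSION -DGGML_COMMIT simply define both macros (to 1 under GCC/Clang when no value is given). A minimal, hypothetical C++ consumer of those markers is sketched below; nothing in this commit shows the macros actually being read, so the function name and its use are assumptions for illustration only.

// Hypothetical consumer of the new build markers (illustration, not part of this commit).
#include <cstdio>

static void print_build_markers(void) {
#if defined(GGML_VERSION) && defined(GGML_COMMIT)
    std::printf("built with GGML version/commit markers\n");   // set via $(VERSIONS)
#else
    std::printf("built without GGML version/commit markers\n");
#endif
}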

base_sampling2/common.cpp

Lines changed: 5 additions & 0 deletions
@@ -1378,6 +1378,11 @@ std::vector<llama_token> common_tokenize(
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
     n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
+
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
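
For context, the new check guards the usual two-pass tokenization pattern: the first llama_tokenize call returns the required token count negated when the preallocated buffer is too small, and a return value of INT32_MIN cannot be safely negated for the resize. A simplified sketch of that flow, assuming the llama.cpp C API from llama.h, is below; the helper name tokenize_sketch is just for illustration.

// Simplified sketch of the two-pass pattern the overflow check protects.
#include <cstdint>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>
#include "llama.h"

std::vector<llama_token> tokenize_sketch(const llama_vocab * vocab, const std::string & text,
                                         bool add_special, bool parse_special) {
    std::vector<llama_token> result(text.length() + 2 * add_special);
    int n_tokens = llama_tokenize(vocab, text.data(), text.length(),
                                  result.data(), result.size(), add_special, parse_special);
    if (n_tokens == std::numeric_limits<int32_t>::min()) {
        // negating INT32_MIN would overflow, so the required size is unrepresentable
        throw std::runtime_error("Tokenization failed: input text too large");
    }
    if (n_tokens < 0) {
        // the first pass reported the required size as a negative count; retry with a larger buffer
        result.resize(-n_tokens);
        n_tokens = llama_tokenize(vocab, text.data(), text.length(),
                                  result.data(), result.size(), add_special, parse_special);
    }
    result.resize(n_tokens);
    return result;
}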

base_sampling2/llama-addon.cpp

Lines changed: 9 additions & 7 deletions
@@ -681,8 +681,7 @@ static void llama_sampler_min_p_addon_apply(struct llama_sampler * smpl, llama_t
         if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
             memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
             cur_p->size = filtered_tokens.size();
-            // Guard against a single choice
-            if (cur_p->size < 2) cur_p->size = 2;
+            // Cannot guard against a single choice here due to memcpy
             min_p_applied = true;
         }
     }
@@ -709,7 +708,7 @@ static void llama_sampler_min_p_addon_apply(struct llama_sampler * smpl, llama_t
         }
 
         // Guard against a single choice
-        if (i < 2) i = 2;
+        if (i < ctx->min_keep) i = ctx->min_keep;
 
         // Resize the output vector to keep only the matching tokens
         cur_p->size = i;
@@ -1123,13 +1122,15 @@ void llama_sample_p_step_addon_apply(struct llama_sampler * smpl, llama_token_da
             step_found = true;
         }
 
-        if (step_found && i >= ctx->min_keep) {
+        if (step_found) {
             // Resize the output vector to keep only the tokens before the step
-            candidates->size = i;
+            if (i > ctx->min_keep) candidates->size = i;
+            else candidates->size = ctx->min_keep;
 
             break;
         }
     }
+
     p_step_total = candidates->size;
 }
 
@@ -1661,8 +1662,9 @@ static void llama_sampler_tail_free_apply(struct llama_sampler * smpl, llama_tok
         cum_sum += second_derivatives[i];
 
         // Check if the running sum is greater than z or if we have kept at least min_keep tokens
-        if (cum_sum > ctx->z && i >= ctx->min_keep) {
-            last_idx = i;
+        if (cum_sum > ctx->z) {
+            if (i > ctx->min_keep) last_idx = i;
+            else last_idx = ctx->min_keep;
             break;
         }
     }
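
All four hunks in this file converge on one change: instead of hard-coding a floor of two candidates, the cutoff index is clamped up to ctx->min_keep before the candidate list is truncated, so a sampler never shrinks the pool below the caller-requested minimum. A standalone sketch of that clamp follows; token_array and truncate_with_min_keep are simplified stand-ins for illustration, not the real llama_token_data_array API.

// Standalone sketch of the min_keep clamp applied above.
#include <algorithm>
#include <cstddef>

struct token_array {
    size_t size;   // number of candidates currently kept (stand-in for llama_token_data_array)
};

// Truncate to `cutoff` candidates, but never drop below `min_keep`.
static void truncate_with_min_keep(token_array & cur, size_t cutoff, size_t min_keep) {
    cur.size = std::max(cutoff, min_keep);
}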

base_sampling2/master/ggml/include/ggml-backend.h

Lines changed: 1 addition & 1 deletion
@@ -339,7 +339,7 @@ extern "C" {
     typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
 
     // Compare the output of two backends
-    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);
 
     // Tensor initialization
     GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
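
A hedged usage sketch of the extended entry point: the callback type is the one declared just above, and test_node is the argument added here. Passing NULL for test_node to compare every node of the graph is an assumption about the intended semantics, not something the header states.

// Sketch only: a pass-through eval callback plus a call using the new signature.
// backend1, backend2 and graph are assumed to have been created elsewhere.
static bool eval_cb(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
    (void) node_index; (void) t1; (void) t2; (void) user_data;
    return true;   // return true to keep comparing subsequent nodes
}

// bool ok = ggml_backend_compare_graph_backend(backend1, backend2, graph,
//                                              eval_cb, /*user_data=*/NULL,
//                                              /*test_node=*/NULL);   // NULL assumed to mean "whole graph"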

base_sampling2/master/ggml/include/ggml-cpu.h

Lines changed: 2 additions & 0 deletions
@@ -101,6 +101,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_riscv_v    (void);
     GGML_BACKEND_API int ggml_cpu_has_vsx        (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe        (void);
+    GGML_BACKEND_API int ggml_cpu_has_nnpa       (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd  (void);
     GGML_BACKEND_API int ggml_cpu_has_llamafile  (void);
 
@@ -133,6 +134,7 @@ extern "C" {
 
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
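
A small sketch exercising the newly declared helper next to the existing ones; that ggml_cpu_fp32_to_fp32 is a plain same-type copy is inferred from its signature only, and the helper name convert_roundtrip is just for illustration.

// Sketch: pushing a buffer through the CPU conversion helpers declared above.
#include <stdint.h>
#include "ggml-cpu.h"

void convert_roundtrip(const float * src, float * dst32, ggml_fp16_t * tmp16, int64_t n) {
    ggml_cpu_fp32_to_fp32(src, dst32, n);     // same-type path (assumed to be a copy)
    ggml_cpu_fp32_to_fp16(src, tmp16, n);     // down-convert to fp16
    ggml_cpu_fp16_to_fp32(tmp16, dst32, n);   // back to fp32, overwriting dst32
}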
