Commit 92afdfc

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#   .github/labeler.yml
#   .github/workflows/server.yml
#   .gitignore
#   CMakeLists.txt
#   Makefile
#   README-sycl.md
#   README.md
#   llama.cpp
#   requirements/requirements-convert-hf-to-gguf-update.txt
#   requirements/requirements-convert-hf-to-gguf.txt
#   requirements/requirements-convert-legacy-llama.txt
#   scripts/sync-ggml.last
#   tests/test-tokenizer-random.py
2 parents 1339847 + 557b653 commit 92afdfc


44 files changed, 11378 additions and 9705 deletions

CMakePresets.json

Lines changed: 23 additions & 8 deletions
@@ -11,9 +11,21 @@
         "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
       }
     },
-
+    {
+      "name": "sycl-base",
+      "hidden": true,
+      "generator": "Ninja",
+      "binaryDir": "${sourceDir}/build-${presetName}",
+      "cacheVariables": {
+        "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+        "CMAKE_CXX_COMPILER": "icx",
+        "LLAMA_SYCL": "ON",
+        "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+      }
+    },
     { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
-    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
+    { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
     { "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },

     {
@@ -35,15 +47,18 @@
     },

     { "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
-    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
-    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
+    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
+    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },

     { "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
-    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
-    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
+    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
+    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },

     { "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
-    { "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
-    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
+    { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
+    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
+
+    { "name": "x64-windows-sycl-debug", "inherits": [ "sycl-base", "debug" ] },
+    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
   ]
 }

Makefile

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@ default: koboldcpp_default koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx
 tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip whispermain sdmain gguf-split
 dev: koboldcpp_openblas
 dev2: koboldcpp_clblast
-
+dev3: koboldcpp_vulkan

 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@@ -158,7 +158,7 @@ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instanc

 ifdef LLAMA_CUBLAS
 	CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-	CUBLASLD_FLAGS = -lcuda -lcublas -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/local/cuda/targets/sbsa-linux/lib -L/usr/lib/wsl/lib
+	CUBLASLD_FLAGS = -lcuda -lcublas -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/local/cuda/targets/sbsa-linux/lib -L/usr/lib/wsl/lib
 	CUBLAS_OBJS = ggml-cuda.o ggml_v3-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 	CUBLAS_OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
 	CUBLAS_OBJS += $(OBJS_CUDA_TEMP_INST)

common/common.cpp

Lines changed: 10 additions & 2 deletions
@@ -7,7 +7,6 @@
 #include "llama.h"

 #include <algorithm>
-#include <cassert>
 #include <cinttypes>
 #include <cmath>
 #include <codecvt>
@@ -543,6 +542,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
         else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
         else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+        else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
         else { invalid_param = true; }
         return true;
     }
@@ -1871,6 +1871,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param

     options.push_back({ "backend" });
     options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
+
     if (llama_supports_mlock()) {
         options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
     }
@@ -2658,7 +2659,14 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     }

     // Set the output file
-    std::unique_ptr<FILE, decltype(&fclose)> outfile(fopen(path_temporary.c_str(), "wb"), fclose);
+
+    struct FILE_deleter {
+        void operator()(FILE * f) const {
+            fclose(f);
+        }
+    };
+
+    std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
     if (!outfile) {
         fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
         return false;
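
The last common.cpp hunk replaces the decltype(&fclose) deleter with the small stateless FILE_deleter functor. A minimal standalone sketch (not part of the commit) of the difference: a deleter encoded in the type carries no runtime state, which typically keeps the unique_ptr pointer-sized, and it avoids taking the address of a C standard library function, something the C++ standard does not in general guarantee to be well-formed.

#include <cstdio>
#include <memory>

// Stateless deleter, same shape as FILE_deleter above.
struct file_closer {
    void operator()(std::FILE * f) const { std::fclose(f); }
};

int main() {
    // Function-pointer deleter: the pointer to fclose is stored inside the unique_ptr.
    std::unique_ptr<std::FILE, int (*)(std::FILE *)> a(std::fopen("a.tmp", "wb"), &std::fclose);

    // Deleter baked into the type: no stored state, and no address of fclose is taken here.
    std::unique_ptr<std::FILE, file_closer> b(std::fopen("b.tmp", "wb"));

    return (a && b) ? 0 : 1;
}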

common/common.h

Lines changed: 0 additions & 1 deletion
@@ -69,7 +69,6 @@ struct gpt_params {
     int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
-    int32_t n_beams = 0; // if non-zero then use beam search of given width.
     int32_t grp_attn_n = 1; // group-attention factor
     int32_t grp_attn_w = 512; // group-attention width
     int32_t n_print = -1; // print token count every n tokens (-1 = disabled)

convert-hf-to-gguf-update.py

Lines changed: 2 additions & 2 deletions
@@ -214,15 +214,15 @@ def get_vocab_base_pre(self, tokenizer) -> str:
     """

 convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
-convert_py = convert_py_pth.read_text()
+convert_py = convert_py_pth.read_text(encoding="utf-8")
 convert_py = re.sub(
     r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
     lambda m: m.group(1) + src_func + m.group(3),
     convert_py,
     flags=re.DOTALL | re.MULTILINE,
 )

-convert_py_pth.write_text(convert_py)
+convert_py_pth.write_text(convert_py, encoding="utf-8")

 logger.info("+++ convert-hf-to-gguf.py was updated")

examples/embedding/embedding.cpp

Lines changed: 11 additions & 10 deletions
@@ -18,9 +18,10 @@ static std::vector<std::string> split_lines(const std::string & s) {
     return lines;
 }

-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }

@@ -41,13 +42,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu

         // try to get sequence embeddings - supported only when pooling_type is not NONE
         const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-        if (embd == NULL) {
-            embd = llama_get_embeddings_ith(ctx, i);
-            if (embd == NULL) {
-                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
-                continue;
-            }
-        }
+        GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");

         float * out = output + batch.seq_id[i][0] * n_embd;
         //TODO: I would also add a parameter here to enable normalization or not.
@@ -98,6 +93,12 @@ int main(int argc, char ** argv) {
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);

+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
     if (n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
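
Taken together, the embedding.cpp hunks make batch_add_seq take a llama_seq_id, flag every token for output instead of only the last one, turn a missing sequence embedding into a hard assertion, and reject pooling type NONE before any work is done. A condensed sketch of that flow under the same assumptions as the example (a context created for embeddings, plus the common.h helpers llama_batch_init, llama_batch_add and llama_batch_free); the name embed_sequence is illustrative only:

// Sketch: encode one tokenized sequence and read back its pooled embedding.
static bool embed_sequence(llama_context * ctx, const llama_model * model,
                           const std::vector<llama_token> & tokens,
                           llama_seq_id seq_id, std::vector<float> & out) {
    if (llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_NONE) {
        return false; // mirrors the new early check in main()
    }

    llama_batch batch = llama_batch_init((int32_t) tokens.size(), 0, 1);
    for (size_t i = 0; i < tokens.size(); i++) {
        // every token requests output, as in the updated batch_add_seq()
        llama_batch_add(batch, tokens[i], (llama_pos) i, { seq_id }, true);
    }

    bool ok = llama_decode(ctx, batch) == 0;
    if (ok) {
        // pooled embedding for the whole sequence; NULL would mean pooling is off
        const float * embd = llama_get_embeddings_seq(ctx, seq_id);
        ok = embd != NULL;
        if (ok) {
            out.assign(embd, embd + llama_n_embd(model));
        }
    }
    llama_batch_free(batch);
    return ok;
}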

examples/gritlm/gritlm.cpp

Lines changed: 4 additions & 2 deletions
@@ -44,6 +44,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve

         // clear previous kv_cache values (irrelevant for embeddings)
         llama_kv_cache_clear(ctx);
+        llama_set_embeddings(ctx, true);
         llama_set_causal_attn(ctx, false);

         // run model
@@ -98,7 +99,9 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
     llama_token eos_token = llama_token_eos(mdl);

     llama_kv_cache_clear(ctx);
+    llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);
+
     llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);

     std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
@@ -166,8 +169,7 @@ int main(int argc, char * argv[]) {

     llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);

-    // create new context - set to embedding mode
-    cparams.embeddings = true;
+    // create generation context
     llama_context * ctx = llama_new_context_with_model(mdl, cparams);

     // ### Embedding/Representation ###
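
The gritlm.cpp change stops fixing the context to embedding mode at creation time (cparams.embeddings = true) and instead flips a single context between representation and generation at runtime. A condensed sketch of the toggle, using only the calls that appear in the hunks; the two helper names are illustrative, and the batching, decoding and sampling loops of encode() and generate() are elided:

// Representation pass: embeddings on, causal attention off.
static void begin_embedding_pass(llama_context * ctx) {
    llama_kv_cache_clear(ctx);         // previous cache is irrelevant for embeddings
    llama_set_embeddings(ctx, true);
    llama_set_causal_attn(ctx, false);
    // ... build the batch and call llama_decode(), as encode() does ...
}

// Generation pass: back to logits with causal attention.
static void begin_generation_pass(llama_context * ctx) {
    llama_kv_cache_clear(ctx);
    llama_set_embeddings(ctx, false);
    llama_set_causal_attn(ctx, true);
    // ... tokenize, decode and sample in a loop, as generate() does ...
}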

examples/infill/infill.cpp

Lines changed: 11 additions & 2 deletions
@@ -224,7 +224,11 @@ int main(int argc, char ** argv) {
         inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
         embd_inp = inp_pfx;
         embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-        embd_inp.push_back(llama_token_middle(model));
+
+        const llama_token middle_token = llama_token_middle(model);
+        if (middle_token >= 0) {
+            embd_inp.push_back(middle_token);
+        }

         LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
         LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
@@ -529,7 +533,12 @@ int main(int argc, char ** argv) {
                 inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
                 embd_inp = inp_pfx;
                 embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-                embd_inp.push_back(llama_token_middle(model));
+
+                const llama_token middle_token = llama_token_middle(model);
+                if (middle_token >= 0) {
+                    embd_inp.push_back(middle_token);
+                }
+
                 embd.clear();
                 n_remain = params.n_predict;
                 n_past = 0;
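
Both infill.cpp hunks add the same guard: llama_token_middle() reports a negative id when the model vocabulary defines no FIM middle token, and the old code pushed that id into the prompt unconditionally. A tiny sketch of the pattern; the helper name is hypothetical, since infill.cpp inlines the check at both call sites:

static void push_token_if_present(std::vector<llama_token> & prompt, llama_token tok) {
    if (tok >= 0) { // a negative id means the vocabulary does not define this token
        prompt.push_back(tok);
    }
}

// usage matching the updated call sites:
//     push_token_if_present(embd_inp, llama_token_middle(model));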

examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift

Lines changed: 21 additions & 14 deletions
@@ -131,22 +131,29 @@ class LlamaState: ObservableObject {

         messageLog += "\(text)"

-        while await llamaContext.n_cur < llamaContext.n_len {
-            let result = await llamaContext.completion_loop()
-            messageLog += "\(result)"
-        }
+        Task.detached {
+            while await llamaContext.n_cur < llamaContext.n_len {
+                let result = await llamaContext.completion_loop()
+                await MainActor.run {
+                    self.messageLog += "\(result)"
+                }
+            }

-        let t_end = DispatchTime.now().uptimeNanoseconds
-        let t_generation = Double(t_end - t_heat_end) / NS_PER_S
-        let tokens_per_second = Double(await llamaContext.n_len) / t_generation
+            let t_end = DispatchTime.now().uptimeNanoseconds
+            let t_generation = Double(t_end - t_heat_end) / self.NS_PER_S
+            let tokens_per_second = Double(await llamaContext.n_len) / t_generation

-        await llamaContext.clear()
-        messageLog += """
-            \n
-            Done
-            Heat up took \(t_heat)s
-            Generated \(tokens_per_second) t/s\n
-            """
+            await llamaContext.clear()
+
+            await MainActor.run {
+                self.messageLog += """
+                    \n
+                    Done
+                    Heat up took \(t_heat)s
+                    Generated \(tokens_per_second) t/s\n
+                    """
+            }
+        }
     }

     func bench() async {

examples/retrieval/retrieval.cpp

Lines changed: 10 additions & 3 deletions
@@ -73,9 +73,10 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
     return chunks;
 }

-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }

@@ -160,6 +161,12 @@ int main(int argc, char ** argv) {
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);

+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
     if (n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
