Commit f3c4239

talk-llama : sync llama.cpp (ggml-org#3084)
ggml-ci
1 parent 28dcdff

36 files changed: +16655 -12115 lines

examples/talk-llama/CMakeLists.txt
Lines changed: 3 additions & 0 deletions

@@ -12,9 +12,12 @@ if (WHISPER_SDL2)
         llama-context.cpp
         llama-cparams.cpp
         llama-grammar.cpp
+        llama-graph.cpp
         llama-hparams.cpp
         llama-impl.cpp
+        llama-io.cpp
         llama-kv-cache.cpp
+        llama-memory.cpp
         llama-mmap.cpp
         llama-model-loader.cpp
         llama-model.cpp
examples/talk-llama/llama-adapter.cpp
Lines changed: 55 additions & 20 deletions

@@ -4,22 +4,21 @@
 #include "llama-mmap.h"
 #include "llama-model.h"
 
-#include <algorithm>
 #include <map>
 #include <cassert>
 #include <stdexcept>
 
 // vec
 
-struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
+ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
         return nullptr;
     }
 
     return tensors[il];
 }
 
-struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
+ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const {
     ggml_tensor * layer_dir = tensor_for(il);
     if (layer_dir != nullptr) {
         cur = ggml_add(ctx, cur, layer_dir);
@@ -40,7 +39,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
-            struct ggml_init_params params = {
+            ggml_init_params params = {
                 /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc   =*/ true,
@@ -91,7 +90,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
     return true;
 }
 
-int32_t llama_adapter_cvec::apply(
+bool llama_adapter_cvec::apply(
         const llama_model & model,
         const float * data,
         size_t len,
@@ -104,17 +103,17 @@ int32_t llama_adapter_cvec::apply(
         // disable the current control vector (but leave allocated for later)
         layer_start = -1;
         layer_end   = -1;
-        return 0;
+        return true;
     }
 
     if (n_embd != (int) hparams.n_embd) {
         LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
-        return 1;
+        return false;
     }
 
     if (tensors.empty()) {
         if (!init(model)) {
-            return 1;
+            return false;
         }
     }
 
@@ -130,12 +129,12 @@ int32_t llama_adapter_cvec::apply(
         }
     }
 
-    return 0;
+    return true;
 }
 
 // lora
 
-llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
     const std::string name(w->name);
 
     const auto pos = ab_map.find(name);
@@ -146,11 +145,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor *
     return nullptr;
 }
 
-static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
     ggml_context * ctx_init;
-    struct gguf_init_params meta_gguf_params = {
+    gguf_init_params meta_gguf_params = {
         /* .no_alloc = */ true,
         /* .ctx      = */ &ctx_init,
     };
@@ -201,7 +200,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
             // add a new context
-            struct ggml_init_params params = {
+            ggml_init_params params = {
                 /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc   =*/ true,
@@ -248,6 +247,26 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
         }
     }
 
+    // get extra buffer types of the CPU
+    // TODO: a more general solution for non-CPU extra buft should be imlpemented in the future
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
+    std::vector<ggml_backend_buffer_type_t> buft_extra;
+    {
+        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+
+        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+
+        if (ggml_backend_dev_get_extra_bufts_fn) {
+            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_extra.emplace_back(*extra_bufts);
+                ++extra_bufts;
+            }
+        }
+    }
+
     // add tensors
     for (auto & it : ab_map) {
         const std::string & name = it.first;
@@ -264,7 +283,23 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
         }
 
-        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
+        auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);
+
+        // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
+        for (auto & ex : buft_extra) {
+            if (ex == buft) {
+                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
+
+                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+                buft = ggml_backend_dev_buffer_type(cpu_dev);
+
+                break;
+            }
+        }
+
+        LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
+
+        ggml_context * dev_ctx = ctx_for_buft(buft);
         // validate tensor shape
         if (is_token_embd) {
             // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
@@ -281,8 +316,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
         }
 
         // save tensor to adapter
-        struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
-        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
+        ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
+        ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
         ggml_set_name(tensor_a, w.a->name);
         ggml_set_name(tensor_b, w.b->name);
         adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
@@ -308,7 +343,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
    {
        llama_file gguf_file(path_lora, "rb");
        std::vector<uint8_t> read_buf;
-       auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
+       auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
            size_t size = ggml_nbytes(orig);
            read_buf.resize(size);
@@ -327,8 +362,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
 
-struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
-    struct llama_adapter_lora * adapter = new llama_adapter_lora();
+llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
+    llama_adapter_lora * adapter = new llama_adapter_lora();
 
     try {
         llama_adapter_lora_init_impl(*model, path_lora, *adapter);
@@ -342,6 +377,6 @@ struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model,
     return nullptr;
 }
 
-void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+void llama_adapter_lora_free(llama_adapter_lora * adapter) {
     delete adapter;
 }

examples/talk-llama/llama-adapter.h
Lines changed: 11 additions & 9 deletions

@@ -15,11 +15,11 @@
 //
 
 struct llama_adapter_cvec {
-    struct ggml_tensor * tensor_for(int il) const;
+    ggml_tensor * tensor_for(int il) const;
 
-    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
+    ggml_tensor * apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const;
 
-    int32_t apply(
+    bool apply(
             const llama_model & model,
             const float * data,
             size_t len,
@@ -36,16 +36,16 @@ struct llama_adapter_cvec {
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;
 
-    std::vector<struct ggml_tensor *> tensors; // per layer
+    std::vector<ggml_tensor *> tensors; // per layer
 };
 
 //
 // llama_adapter_lora
 //
 
 struct llama_adapter_lora_weight {
-    struct ggml_tensor * a = nullptr;
-    struct ggml_tensor * b = nullptr;
+    ggml_tensor * a = nullptr;
+    ggml_tensor * b = nullptr;
 
     // get actual scale based on rank and alpha
     float get_scale(float alpha, float adapter_scale) const {
@@ -55,12 +55,12 @@ struct llama_adapter_lora_weight {
     }
 
     llama_adapter_lora_weight() = default;
-    llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
+    llama_adapter_lora_weight(ggml_tensor * a, ggml_tensor * b) : a(a), b(b) {}
 };
 
 struct llama_adapter_lora {
     // map tensor name to lora_a_b
-    std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;
+    std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
 
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;
@@ -70,5 +70,7 @@ struct llama_adapter_lora {
     llama_adapter_lora() = default;
     ~llama_adapter_lora() = default;
 
-    llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
+    llama_adapter_lora_weight * get_weight(ggml_tensor * w);
 };
+
+using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
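
The header also introduces the llama_adapter_loras alias, which maps each loaded LoRA adapter to its scale. A minimal sketch of how such a map could be maintained by calling code follows; the helpers set_adapter_scale and sum_scales are illustrative assumptions, not part of this header.

// Sketch only: keeping per-adapter scales in a llama_adapter_loras map.
// The two helpers below are hypothetical, for illustration.
#include "llama-adapter.h"

static void set_adapter_scale(llama_adapter_loras & loras, llama_adapter_lora * adapter, float scale) {
    if (scale == 0.0f) {
        loras.erase(adapter);   // a zero scale effectively disables the adapter
    } else {
        loras[adapter] = scale; // insert or update the scale for this adapter
    }
}

static float sum_scales(const llama_adapter_loras & loras) {
    float sum = 0.0f;
    for (const auto & it : loras) {
        sum += it.second;       // e.g. for logging the overall LoRA strength
    }
    return sum;
}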
