@@ -4,22 +4,21 @@
 #include "llama-mmap.h"
 #include "llama-model.h"
 
-#include <algorithm>
 #include <map>
 #include <cassert>
 #include <stdexcept>
 
 // vec
 
-struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
+ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
         return nullptr;
     }
 
     return tensors[il];
 }
 
-struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
+ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const {
     ggml_tensor * layer_dir = tensor_for(il);
     if (layer_dir != nullptr) {
         cur = ggml_add(ctx, cur, layer_dir);
@@ -40,7 +39,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
-            struct ggml_init_params params = {
+            ggml_init_params params = {
                 /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc   =*/ true,
@@ -91,7 +90,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
     return true;
 }
 
-int32_t llama_adapter_cvec::apply(
+bool llama_adapter_cvec::apply(
         const llama_model & model,
         const float * data,
         size_t len,
@@ -104,17 +103,17 @@ int32_t llama_adapter_cvec::apply(
         // disable the current control vector (but leave allocated for later)
         layer_start = -1;
         layer_end   = -1;
-        return 0;
+        return true;
     }
 
     if (n_embd != (int) hparams.n_embd) {
         LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
-        return 1;
+        return false;
     }
 
     if (tensors.empty()) {
         if (!init(model)) {
-            return 1;
+            return false;
         }
     }
 
@@ -130,12 +129,12 @@ int32_t llama_adapter_cvec::apply(
         }
     }
 
-    return 0;
+    return true;
 }
 
 // lora
 
-llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
     const std::string name(w->name);
 
     const auto pos = ab_map.find(name);
@@ -146,11 +145,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor *
     return nullptr;
 }
 
-static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
     ggml_context * ctx_init;
-    struct gguf_init_params meta_gguf_params = {
+    gguf_init_params meta_gguf_params = {
         /* .no_alloc = */ true,
         /* .ctx      = */ &ctx_init,
     };
@@ -201,7 +200,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
             // add a new context
-            struct ggml_init_params params = {
+            ggml_init_params params = {
                 /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc   =*/ true,
@@ -248,6 +247,26 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
         }
     }
 
+    // get extra buffer types of the CPU
+    // TODO: a more general solution for non-CPU extra buft should be implemented in the future
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
+    std::vector<ggml_backend_buffer_type_t> buft_extra;
+    {
+        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+
+        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+
+        if (ggml_backend_dev_get_extra_bufts_fn) {
+            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_extra.emplace_back(*extra_bufts);
+                ++extra_bufts;
+            }
+        }
+    }
+
     // add tensors
     for (auto & it : ab_map) {
         const std::string & name = it.first;
@@ -264,7 +283,23 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
         }
 
-        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
+        auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);
+
+        // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
+        for (auto & ex : buft_extra) {
+            if (ex == buft) {
+                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
+
+                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+                buft = ggml_backend_dev_buffer_type(cpu_dev);
+
+                break;
+            }
+        }
+
+        LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
+
+        ggml_context * dev_ctx = ctx_for_buft(buft);
         // validate tensor shape
         if (is_token_embd) {
             // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
@@ -281,8 +316,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
         }
 
         // save tensor to adapter
-        struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
-        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
+        ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
+        ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
         ggml_set_name(tensor_a, w.a->name);
         ggml_set_name(tensor_b, w.b->name);
         adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
@@ -308,7 +343,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
     {
         llama_file gguf_file(path_lora, "rb");
         std::vector<uint8_t> read_buf;
-        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
+        auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
             size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
             size_t size = ggml_nbytes(orig);
             read_buf.resize(size);
@@ -327,8 +362,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
 
-struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
-    struct llama_adapter_lora * adapter = new llama_adapter_lora();
+llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
+    llama_adapter_lora * adapter = new llama_adapter_lora();
 
     try {
         llama_adapter_lora_init_impl(*model, path_lora, *adapter);
@@ -342,6 +377,6 @@ struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model,
     return nullptr;
 }
 
-void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+void llama_adapter_lora_free(llama_adapter_lora * adapter) {
     delete adapter;
 }
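
For reference, a minimal caller-side sketch of how the adapter entry points changed above are driven. This is not part of the commit; it assumes the public llama.h API (llama_model_load_from_file, llama_model_free) and uses illustrative file paths.

    // illustration only: exercises llama_adapter_lora_init()/llama_adapter_lora_free() from the diff above
    #include "llama.h"

    int main() {
        llama_model * model = llama_model_load_from_file("model.gguf", llama_model_default_params());
        if (model == nullptr) {
            return 1;
        }

        // returns nullptr on failure; llama_adapter_lora_init_impl() throws internally and the
        // exception is caught, logged and converted to a nullptr result
        llama_adapter_lora * adapter = llama_adapter_lora_init(model, "lora-adapter.gguf");
        if (adapter == nullptr) {
            llama_model_free(model);
            return 1;
        }

        // ... attach the adapter to a llama_context and run inference ...

        llama_adapter_lora_free(adapter);
        llama_model_free(model);
        return 0;
    }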