
Commit 877b4d0

vgel and ggerganov authored
llama : add support for control vectors (#5970)
* control vector api and implementation
* control-vectors : minor code style updates
* disable control vector when data == nullptr
  use -1 for disabled range (also on init) in case we ever support controlling layer 0 (embeddings)

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent 12247f4 commit 877b4d0

File tree: 4 files changed, +392 -5 lines changed


common/common.cpp

Lines changed: 215 additions & 0 deletions
@@ -568,6 +568,34 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.lora_base = argv[i];
+        } else if (arg == "--control-vector") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vectors.push_back({ 1.0f, argv[i], });
+        } else if (arg == "--control-vector-scaled") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            const char * fname = argv[i];
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vectors.push_back({ std::stof(argv[i]), fname, });
+        } else if (arg == "--control-vector-layer-range") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vector_layer_start = std::stoi(argv[i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vector_layer_end = std::stoi(argv[i]);
         } else if (arg == "--mmproj") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -1095,6 +1123,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
     printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
     printf("  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    printf("  --control-vector FNAME\n");
+    printf("                        add a control vector\n");
+    printf("  --control-vector-scaled FNAME S\n");
+    printf("                        add a control vector with user defined scaling S\n");
+    printf("  --control-vector-layer-range START END\n");
+    printf("                        layer range to apply the control vector(s) to, start and end inclusive\n");
     printf("  -m FNAME, --model FNAME\n");
     printf("                        model path (default: %s)\n", params.model.c_str());
     printf("  -md FNAME, --model-draft FNAME\n");
@@ -1360,6 +1394,30 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         return std::make_tuple(nullptr, nullptr);
     }
 
+    if (!params.control_vectors.empty()) {
+        if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
+        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
+
+        const auto cvec = llama_control_vector_load(params.control_vectors);
+        if (cvec.n_embd == -1) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }
+
+        int err = llama_control_vector_apply(lctx,
+                                             cvec.data.data(),
+                                             cvec.data.size(),
+                                             cvec.n_embd,
+                                             params.control_vector_layer_start,
+                                             params.control_vector_layer_end);
+        if (err) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }
+    }
+
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
@@ -1890,3 +1948,160 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 
     return sum / (sqrt(sum1) * sqrt(sum2));
 }
+
+//
+// Control vector utils
+//
+
+static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
+    int32_t n_tensors;
+
+    size_t n_bytes = 0;
+
+    uint32_t max_direction_layer = 0;
+
+    llama_control_vector_data result = { -1, {} };
+
+    // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
+    {
+        struct ggml_init_params meta_params = {
+            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
+            /* .mem_buffer = */ nullptr,
+            /* .no_alloc   = */ true,
+        };
+        ggml_context * meta_ctx = ggml_init(meta_params);
+        struct gguf_init_params meta_gguf_params = {
+            /* .no_alloc = */ true,
+            /* .ctx      = */ &meta_ctx,
+        };
+        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
+        if (!meta_ctx_gguf) {
+            fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
+            ggml_free(meta_ctx);
+            return result;
+        }
+
+        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
+        for (int i = 0; i < n_tensors; i++) {
+            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
+
+            // split on '.'
+            size_t dotpos = name.find('.');
+            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+                try {
+                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
+                    if (layer == 0) {
+                        fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
+                        ggml_free(meta_ctx);
+                        gguf_free(meta_ctx_gguf);
+                        return result;
+                    }
+                    if (layer > max_direction_layer) {
+                        max_direction_layer = layer;
+                    }
+                } catch (...) {
+                    fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
+                    ggml_free(meta_ctx);
+                    gguf_free(meta_ctx_gguf);
+                    return result;
+                }
+            }
+
+            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
+            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
+                fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
+                ggml_free(meta_ctx);
+                gguf_free(meta_ctx_gguf);
+                return result;
+            }
+            if (result.n_embd == -1) {
+                result.n_embd = ggml_nelements(tensor_meta);
+            } else if (ggml_nelements(tensor_meta) != result.n_embd) {
+                fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
+                ggml_free(meta_ctx);
+                gguf_free(meta_ctx_gguf);
+                return result;
+            }
+            n_bytes += ggml_nbytes(tensor_meta);
+        }
+        ggml_free(meta_ctx);
+        gguf_free(meta_ctx_gguf);
+    }
+
+    if (n_tensors == 0) {
+        fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
+        return result;
+    }
+
+    // load and scale tensors into final control vector context
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
+        /* .mem_buffer = */ nullptr,
+        /* .no_alloc   = */ false,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
+        ggml_free(ctx);
+        return result;
+    }
+
+    // do not store data for layer 0 (it's not used)
+    result.data.resize(result.n_embd * max_direction_layer);
+
+    for (uint32_t il = 1; il <= max_direction_layer; il++) {
+        const std::string name = "direction." + std::to_string(il);
+        const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+
+        float * dst = result.data.data() + result.n_embd * (il - 1);
+
+        if (tensor) {
+            const float * src = (const float *) tensor->data;
+            for (int j = 0; j < result.n_embd; j++) {
+                dst[j] = src[j] * load_info.strength;
+            }
+        } else {
+            for (int j = 0; j < result.n_embd; j++) {
+                dst[j] = 0.0f;
+            }
+        }
+    }
+
+    return result;
+}
+
+llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
+    llama_control_vector_data result = { -1, {} };
+
+    for (const auto & info : load_infos) {
+        auto cur = llama_control_vector_load_one(info);
+
+        if (cur.n_embd == -1) {
+            return result;
+        }
+        if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
+            fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
+            return result;
+        }
+
+        if (result.n_embd == -1) {
+            result = std::move(cur);
+        } else {
+            for (size_t i = 0; i < cur.data.size(); i++) {
+                result.data[i] += cur.data[i];
+            }
+        }
+    }
+
+    if (result.n_embd == -1) {
+        fprintf(stderr, "%s: no vectors passed\n", __func__);
+    }
+
+    return result;
+}
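
The loader above expects 1-D F32 tensors named "direction.<layer>" with layer >= 1, all of the same length (n_embd). The following is a minimal sketch, not part of this commit, of how such a file could be produced with the existing ggml/gguf writer API (gguf_init_empty, gguf_add_tensor, gguf_write_to_file); the file name, dimensions, and zero-filled directions are placeholders.

    #include <string>
    #include "ggml.h"

    // Hedged sketch: write a control-vector GGUF in the layout inferred from
    // llama_control_vector_load_one. Values are illustrative only.
    static void write_example_control_vector(const char * fname, int n_embd, int n_layer) {
        struct ggml_init_params params = {
            /* .mem_size   = */ (size_t) n_layer * (ggml_tensor_overhead() + (size_t) n_embd * sizeof(float) + 64),
            /* .mem_buffer = */ nullptr,
            /* .no_alloc   = */ false,
        };
        struct ggml_context * ctx      = ggml_init(params);
        struct gguf_context * ctx_gguf = gguf_init_empty();

        for (int il = 1; il <= n_layer; il++) {
            const std::string name = "direction." + std::to_string(il);
            struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
            ggml_set_name(t, name.c_str());
            float * data = (float *) t->data;
            for (int j = 0; j < n_embd; j++) {
                data[j] = 0.0f; // fill with a real direction here
            }
            gguf_add_tensor(ctx_gguf, t);
        }

        gguf_write_to_file(ctx_gguf, fname, /* only_meta = */ false);

        gguf_free(ctx_gguf);
        ggml_free(ctx);
    }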

common/common.h

Lines changed: 30 additions & 1 deletion
@@ -37,10 +37,13 @@ extern char const *LLAMA_COMMIT;
 extern char const *LLAMA_COMPILER;
 extern char const *LLAMA_BUILD_TARGET;
 
+struct llama_control_vector_load_info;
+
+int32_t get_num_physical_cores();
+
 //
 // CLI argument parsing
 //
-int32_t get_num_physical_cores();
 
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
@@ -103,6 +106,11 @@
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
     std::string lora_base = ""; // base model path for the lora adapter
 
+    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+
+    int32_t control_vector_layer_start = -1; // layer range for control vector
+    int32_t control_vector_layer_end   = -1; // layer range for control vector
+
     int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                              // (which is more convenient to use for plotting)
@@ -269,3 +277,24 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40
 void llama_embd_normalize(const float * inp, float * out, int n);
 
 float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+
+//
+// Control vector utils
+//
+
+struct llama_control_vector_data {
+    int n_embd;
+
+    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
+    std::vector<float> data;
+};
+
+struct llama_control_vector_load_info {
+    float strength;
+
+    std::string fname;
+};
+
+// Load control vectors, scale each by strength, and add them together.
+// On error, returns {-1, empty}
+llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
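
A short usage sketch of the helpers declared above, mirroring the call sequence added to llama_init_from_gpt_params in common.cpp. The file names, scales, and layer range are placeholders, and ctx/model are assumed to be an already-initialized llama_context * and llama_model *.

    // Hedged usage sketch; not part of this commit.
    std::vector<llama_control_vector_load_info> infos = {
        { 1.0f, "happy.gguf" }, // default strength, as with --control-vector
        { 0.5f, "calm.gguf"  }, // scaled, as with --control-vector-scaled FNAME S
    };

    llama_control_vector_data cvec = llama_control_vector_load(infos);
    if (cvec.n_embd == -1) {
        // load failed: missing file, non-F32 tensors, or mismatched dimensions
        fprintf(stderr, "failed to load control vectors\n");
    } else {
        // apply to layers [1, n_layer]; per the commit message, calling apply with
        // data == nullptr disables a previously applied control vector
        int err = llama_control_vector_apply(ctx,
                                             cvec.data.data(),
                                             cvec.data.size(),
                                             cvec.n_embd,
                                             1,
                                             llama_n_layer(model));
        if (err) {
            fprintf(stderr, "failed to apply control vectors\n");
        }
    }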
