@@ -1,6 +1,8 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstdint>
+#include <cstdio>
 #endif
 
 #include "llama_util.h"
@@ -1758,8 +1760,7 @@ int llama_model_quantize(
     }
 }
 
-int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, int n_threads) {
-    // TODO: refactor all of this after PR #801
+int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
    fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
    auto & model = ctx->model;
@@ -1800,13 +1801,13 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
 
     // create a temporary ggml context to store the lora tensors
     // todo: calculate size from biggest possible tensor
-    std::vector<uint8_t> buf(1024ull * 1024ull * 1024ull);
+    std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
     struct ggml_init_params params;
-    params.mem_size   = buf.size();
-    params.mem_buffer = buf.data();
+    params.mem_size   = lora_buf.size();
+    params.mem_buffer = lora_buf.data();
     params.no_alloc   = false;
 
-    ggml_context* lora_ctx = ggml_init(params);
+    ggml_context * lora_ctx = ggml_init(params);
     std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
 
     // create a name -> tensor map of the model to accelerate lookups
@@ -1815,6 +1816,32 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
         model_tensors.insert(kv);
     }
 
+
+    // load base model
+    std::unique_ptr<llama_model_loader> model_loader;
+    ggml_context * base_ctx = NULL;
+    llama_buffer base_buf;
+    if (path_base_model) {
+        fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+
+        size_t ctx_size, mmapped_size;
+        model_loader->calc_sizes(&ctx_size, &mmapped_size);
+        base_buf.resize(ctx_size);
+
+        ggml_init_params base_params;
+        base_params.mem_size   = base_buf.size;
+        base_params.mem_buffer = base_buf.addr;
+        base_params.no_alloc   = model_loader->use_mmap;
+
+        base_ctx = ggml_init(base_params);
+
+        model_loader->ggml_ctx = base_ctx;
+
+        // maybe this should be in llama_model_loader
+        model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, false));
+    }
+
     fprintf(stderr, "%s: ", __func__);
 
     // read tensors and apply
@@ -1891,13 +1918,31 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
         if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
             lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
 
-            ggml_tensor * tensor = model_tensors[base_name];
+            ggml_tensor * dest_t = model_tensors[base_name];
+            ggml_tensor * base_t;
+            if (model_loader) {
+                // load from base model
+                if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
+                    fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
+                    return 1;
+                }
+                size_t idx = model_loader->tensors_map.name_to_idx[base_name];
+                llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
+                base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+                lt.data = (uint8_t *) lt.ggml_tensor->data;
+                model_loader->load_data_for(lt);
+                lt.ggml_tensor->data = lt.data;
+            }
+            else {
+                base_t = dest_t;
+            }
+
             ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
             ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
 
-            if (tensor->ne[0] != loraA->ne[1] || tensor->ne[1] != loraB->ne[1]) {
+            if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
                 fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
-                               " are you sure that this adapter is for this model?\n", __func__, tensor->ne[0], loraA->ne[1]);
+                               " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
                 return 1;
             }
 
@@ -1909,14 +1954,14 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
                 BA = ggml_scale(lora_ctx, BA, scale_tensor);
             }
 
-            // printf("%s: (B)(%d %d %d %d) x (A)(%d %d %d %d) => (BA)(%d %d %d %d) + (T)(%d %d %d %d)\n",
-            //     base_name.c_str(),
-            //     (int)loraB->ne[0], (int)loraB->ne[1], (int)loraB->ne[2], (int)loraB->ne[3],
-            //     (int)loraA->ne[0], (int)loraA->ne[1], (int)loraA->ne[2], (int)loraA->ne[3],
-            //     (int)BA->ne[0], (int)BA->ne[1], (int)BA->ne[2], (int)BA->ne[3],
-            //     (int)tensor->ne[0], (int)tensor->ne[1], (int)tensor->ne[2], (int)tensor->ne[3]
-            // );
-            ggml_tensor * r = ggml_add_inplace(lora_ctx, tensor, BA);
+            ggml_tensor * r;
+            if (base_t == dest_t) {
+                r = ggml_add_inplace(lora_ctx, dest_t, BA);
+            }
+            else {
+                r = ggml_add(lora_ctx, base_t, BA);
+                r = ggml_cpy(lora_ctx, r, dest_t);
+            }
 
             struct ggml_cgraph gf = ggml_build_forward(r);
             gf.n_threads = n_threads;
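The branch added in this hunk merges the low-rank update either in place (when no separate base model was given) or against a freshly loaded f16 base tensor that is then copied over the destination. Either way, the graph evaluates the standard LoRA merge, roughly W_dest = W_base + (alpha / r) * (B x A). A naive scalar sketch of that computation follows; the shapes, names, and helper are illustrative assumptions, not code from this PR:

#include <vector>

// Naive equivalent of the merge the ggml graph performs for one weight matrix.
// Assumed layout: A is [r x n_in], B is [n_out x r], w holds the destination
// weights [n_out x n_in] in row-major order.
static void apply_lora_naive(std::vector<float> & w,
                             const std::vector<float> & a,
                             const std::vector<float> & b,
                             int n_out, int n_in, int r, float alpha) {
    const float scaling = alpha / (float) r;        // same role as scale_tensor above
    for (int i = 0; i < n_out; ++i) {
        for (int j = 0; j < n_in; ++j) {
            float ba = 0.0f;
            for (int k = 0; k < r; ++k) {
                ba += b[i * r + k] * a[k * n_in + j];  // low-rank product (B x A)[i][j]
            }
            w[i * n_in + j] += scaling * ba;           // merge the scaled delta into W
        }
    }
}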
@@ -1933,14 +1978,27 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
         }
     }
 
+    // TODO: this should be in a destructor, it will leak on failure
     ggml_free(lora_ctx);
+    if (base_ctx) {
+        ggml_free(base_ctx);
+    }
 
     const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
     fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
 
     return 0;
 }
 
+int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+    } catch (const std::string & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+        return 1;
+    }
+}
+
 // Returns the KV cache that will contain the context for the
 // ongoing prediction with the model.
 const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
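The public entry point keeps its old name but gains the base-model path, with the real work done inside the try/catch wrapper around the internal function. A minimal caller sketch under that signature is below; the helper, file names, and thread count are placeholders, not part of this PR. Passing NULL for path_base_model applies the adapter directly to the tensors already loaded in the context, as the if (path_base_model) branch above shows.

#include <cstdio>
#include "llama.h"

// Hypothetical usage of the updated API; paths are placeholders.
static bool load_adapter(struct llama_context * ctx) {
    int err = llama_apply_lora_from_file(ctx,
                                         "lora-adapter.bin",    // LoRA adapter file
                                         "ggml-model-f16.bin",  // optional f16 base model (or NULL)
                                         /*n_threads*/ 4);
    if (err != 0) {
        fprintf(stderr, "failed to apply lora adapter\n");
        return false;
    }
    return true;
}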