@@ -1768,8 +1768,12 @@ int llama_model_quantize(
 
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, int n_threads) {
     // TODO: refactor all of this after PR #801
+    fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+
     auto & model = ctx->model;
 
+    const int64_t t_start_lora_us = ggml_time_us();
+
     auto fin = std::ifstream(path_lora, std::ios::binary);
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
@@ -1882,7 +1886,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
             lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
 
             ggml_tensor * tensor = model.tensors[base_name];
-            ggml_tensor * loraA = ggml_transpose(lora_ctx, lora_tensors[base_name + ".loraA"]);
+            ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
             ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
 
             if (tensor->ne[0] != loraA->ne[1]) {
@@ -1909,7 +1913,11 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
             fprintf(stderr, ".");
         }
     }
-    fprintf(stderr, " done\n");
+
+    ggml_free(lora_ctx);
+
+    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
+    fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
 
     return 0;
 }
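Two notes on the change, for readers skimming the diff. First, the new timing follows ggml's usual wall-clock pattern: record `ggml_time_us()` before the work, subtract afterwards, and report milliseconds. A minimal standalone sketch of that pattern, assuming only the real `ggml_time_init()`/`ggml_time_us()` helpers from `ggml.h` (the `apply_adapter()` stand-in is hypothetical):

```cpp
#include <cstdint>
#include <cstdio>

#include "ggml.h"

// Hypothetical stand-in for the LoRA apply loop that the diff times.
static void apply_adapter() { /* ... */ }

int main() {
    ggml_time_init(); // initialize ggml's clock once per process

    const int64_t t_start_us = ggml_time_us(); // wall-clock start, in microseconds

    apply_adapter();

    const int64_t t_us = ggml_time_us() - t_start_us;
    fprintf(stderr, " done (%.2f ms)\n", t_us / 1000.0); // same format as the patch
    return 0;
}
```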
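Second, dropping the runtime `ggml_transpose()` on `loraA` presumably means the adapter file now stores `loraA` already transposed; the surviving shape check `tensor->ne[0] != loraA->ne[1]` then still matches ggml's matmul convention, where `ggml_mul_mat(ctx, a, b)` contracts over `ne[0]` of both operands. A small sketch of that convention (the tensor sizes are made up for illustration):

```cpp
#include <cstdio>

#include "ggml.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // ggml_mul_mat contracts ne[0]: a is (K x M), b is (K x N), result is (M x N).
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4); // K=8, M=4
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 2); // K=8, N=2
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);

    printf("c: %lld x %lld\n", (long long) c->ne[0], (long long) c->ne[1]); // "c: 4 x 2"

    ggml_free(ctx);
    return 0;
}
```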