@@ -1752,8 +1752,12 @@ int llama_model_quantize(
1752
1752
1753
1753
int llama_apply_lora_from_file (struct llama_context * ctx, const char * path_lora, int n_threads) {
1754
1754
// TODO: refactor all of this after PR #801
1755
+ fprintf (stderr, " %s: applying lora adapter from '%s' - please wait ...\n " , __func__, path_lora);
1756
+
1755
1757
auto & model = ctx->model ;
1756
1758
1759
+ const int64_t t_start_lora_us = ggml_time_us ();
1760
+
1757
1761
auto fin = std::ifstream (path_lora, std::ios::binary);
1758
1762
if (!fin) {
1759
1763
fprintf (stderr, " %s: failed to open '%s'\n " , __func__, path_lora);
@@ -1866,7 +1870,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
1866
1870
lora_tensors.find (base_name + " .loraB" ) != lora_tensors.end ()) {
1867
1871
1868
1872
ggml_tensor * tensor = model.tensors [base_name];
1869
- ggml_tensor * loraA = ggml_transpose (lora_ctx, lora_tensors[base_name + " .loraA" ]) ;
1873
+ ggml_tensor * loraA = lora_tensors[base_name + " .loraA" ];
1870
1874
ggml_tensor * loraB = lora_tensors[base_name + " .loraB" ];
1871
1875
1872
1876
if (tensor->ne [0 ] != loraA->ne [1 ]) {
@@ -1893,7 +1897,11 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
1893
1897
fprintf (stderr, " ." );
1894
1898
}
1895
1899
}
1896
- fprintf (stderr, " done\n " );
1900
+
1901
+ ggml_free (lora_ctx);
1902
+
1903
+ const int64_t t_lora_us = ggml_time_us () - t_start_lora_us;
1904
+ fprintf (stderr, " done (%.2f ms)\n " , t_lora_us / 1000.0 );
1897
1905
1898
1906
return 0 ;
1899
1907
}
0 commit comments