Skip to content

Commit 4e9cf0a

Browse files
committed
Export lora A matrix pre-transposed
1 parent 3da7379 commit 4e9cf0a

File tree

2 files changed

+12
-2
lines changed

2 files changed

+12
-2
lines changed

convert-lora-to-ggml.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ def write_tensor_header(self, name: str, shape: Sequence[int], data_type: 1) ->
9494
# since ggml doesn't always support other types for the second operand,
9595
# the tensors are always converted and exported as f32
9696
t = v.float().numpy()
97+
if "lora_A" in k:
98+
t = t.T
9799
print(f"{k} => {translate_tensor_name(k)} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
98100
write_tensor_header(fout, translate_tensor_name(k), t.shape, t.dtype)
99101
t.tofile(fout)

llama.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1759,8 +1759,12 @@ int llama_model_quantize(
17591759

17601760
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, int n_threads) {
17611761
// TODO: refactor all of this after PR #801
1762+
fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
1763+
17621764
auto & model = ctx->model;
17631765

1766+
const int64_t t_start_lora_us = ggml_time_us();
1767+
17641768
auto fin = std::ifstream(path_lora, std::ios::binary);
17651769
if (!fin) {
17661770
fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
@@ -1873,7 +1877,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
18731877
lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
18741878

18751879
ggml_tensor * tensor = model.tensors[base_name];
1876-
ggml_tensor * loraA = ggml_transpose(lora_ctx, lora_tensors[base_name + ".loraA"]);
1880+
ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
18771881
ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
18781882

18791883
if (tensor->ne[0] != loraA->ne[1]) {
@@ -1900,7 +1904,11 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
19001904
fprintf(stderr, ".");
19011905
}
19021906
}
1903-
fprintf(stderr, " done\n");
1907+
1908+
ggml_free(lora_ctx);
1909+
1910+
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
1911+
fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
19041912

19051913
return 0;
19061914
}

0 commit comments

Comments (0)