Commit 14153b0

Support more layer types, fix memory and generation issues
1 parent 4d25b26 commit 14153b0

3 files changed: +36 −36 lines changed


convert-lora-to-ggml.py

Lines changed: 25 additions & 22 deletions

@@ -44,18 +44,18 @@ class QuantizedDataType:
 }
 
 HF_SUBLAYER_TO_GGML = {
-    "self_attn.q_proj": "attention.wq.weight",
-    "self_attn.k_proj": "attention.wk.weight",
-    "self_attn.v_proj": "attention.wv.weight",
-    "self_attn.o_proj": "attention.wo.weight",
-    # "embed_tokens.weight": "tok_embeddings.weight",
-    # "norm.weight": "norm.weight",
-    # "lm_head.weight": "output.weight",
-    # "mlp.gate_proj": "feed_forward.w1.weight",
-    # "mlp.down_proj": "feed_forward.w2.weight",
-    # "mlp.up_proj": "feed_forward.w3.weight",
-    # "input_layernorm": "attention_norm.weight",
-    # "post_attention_layernorm": "ffn_norm.weight",
+    "self_attn.q_proj": "attention.wq",
+    "self_attn.k_proj": "attention.wk",
+    "self_attn.v_proj": "attention.wv",
+    "self_attn.o_proj": "attention.wo",
+    "mlp.gate_proj": "feed_forward.w1",
+    "mlp.down_proj": "feed_forward.w2",
+    "mlp.up_proj": "feed_forward.w3",
+    "input_layernorm": "attention_norm",
+    "post_attention_layernorm": "ffn_norm",
+    # "norm": "norm",
+    # "embed_tokens": "tok_embeddings",
+    # "lm_head": "output",
 }
 
 
@@ -71,7 +71,9 @@ def translate_tensor_name(t):
             print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
             sys.exit(1)
 
-        output_string = f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.lora{lora_type}"
+        output_string = (
+            f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
+        )
         return output_string
     else:
         print(f"Error: unrecognized tensor {t}")
@@ -138,16 +140,17 @@ def write_tensor_header(self, name: str, shape: Sequence[int], data_type: 1) ->
 
 write_file_header(fout, params)
 for k, v in model.items():
-    # since ggml doesn't always support other types for the second operand,
-    # the tensors are always converted and exported as f32
-    v = v.float()
+    if k.endswith("lora_A.weight"):
+        if v.dtype != torch.float16 and v.dtype != torch.float32:
+            v = v.float()
+        v = v.T
+    else:
+        v = v.float()
+
     t = v.numpy()
-    if "lora_A" in k:
-        t = t.T
-    print(
-        f"{k} => {translate_tensor_name(k)} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB"
-    )
-    write_tensor_header(fout, translate_tensor_name(k), t.shape, t.dtype)
+    tname = translate_tensor_name(k)
+    print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
+    write_tensor_header(fout, tname, t.shape, t.dtype)
     t.tofile(fout)
 
 print(f"Converted {input_json} and {input_model} to {output_path}")

ggml.c

Lines changed: 0 additions & 5 deletions

@@ -5955,11 +5955,6 @@ static void ggml_compute_forward_add_q_f32(
     GGML_ASSERT(nb1 <= nb2);
     GGML_ASSERT(nb2 <= nb3);
 
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
-
     GGML_ASSERT(src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1);
     GGML_ASSERT(dst->type == src0->type);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);

llama.cpp

Lines changed: 11 additions & 9 deletions

@@ -616,6 +616,7 @@ struct llama_model_loader {
             throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
                          name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }
+
         return get_tensor_for(lt);
     }
 
@@ -1798,7 +1799,8 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
 
 
     // create a temporary ggml context to store the lora tensors
-    std::vector<uint8_t> buf(1024 * 1024 * 100);
+    // todo: calculate size from biggest possible tensor
+    std::vector<uint8_t> buf(1024ull * 1024ull * 1024ull);
     struct ggml_init_params params;
     params.mem_size = buf.size();
     params.mem_buffer = buf.data();
@@ -1829,11 +1831,9 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
             break;
         }
 
-        int32_t nelements = 1;
         int32_t ne[2] = { 1, 1 };
         for (int i = 0; i < n_dims; ++i) {
             fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-            nelements *= ne[i];
         }
 
         std::string name(length, 0);
@@ -1902,24 +1902,26 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
         }
 
         // w = w + BA*s
-        ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraB, loraA);
+        ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
 
         if (scaling != 1.0f) {
             ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
             BA = ggml_scale(lora_ctx, BA, scale_tensor);
         }
 
+        //printf("%s: (B)(%d %d %d %d) x (A)(%d %d %d %d) => (BA)(%d %d %d %d) + (T)(%d %d %d %d)\n",
+        //    base_name.c_str(),
+        //    (int)loraB->ne[0], (int)loraB->ne[1], (int)loraB->ne[2], (int)loraB->ne[3],
+        //    (int)loraA->ne[0], (int)loraA->ne[1], (int)loraA->ne[2], (int)loraA->ne[3],
+        //    (int)BA->ne[0], (int)BA->ne[1], (int)BA->ne[2], (int)BA->ne[3],
+        //    (int)tensor->ne[0], (int)tensor->ne[1], (int)tensor->ne[2], (int)tensor->ne[3]
+        //);
         ggml_tensor * r = ggml_add_inplace(lora_ctx, tensor, BA);
-        //ggml_tensor * r = ggml_add(lora_ctx, tensor, BA);
-        //r = ggml_cpy(lora_ctx, r, tensor);
 
         struct ggml_cgraph gf = ggml_build_forward(r);
         gf.n_threads = n_threads;
         ggml_graph_compute(lora_ctx, &gf);
 
-        // hack until ggml_cpy supports quantized tensors
-        // memcpy(tensor->data, r->data, ggml_nbytes(tensor));
-
         // we won't need these tensors again, reset the context to save memory
         ggml_free(lora_ctx);
         lora_ctx = ggml_init(params);
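The generation fix is the swapped ggml_mul_mat argument order: together with the exporter now writing lora_A transposed, the computed BA delta lines up with the shape of the base weight, and the quantized tensor is patched in place via ggml_add_inplace rather than the earlier add-then-copy workaround. The arithmetic is still the one in the comment, w = w + BA*s. A small numpy sketch of that math follows; the shapes and the alpha/r scaling are the usual LoRA convention, used here for illustration rather than taken from this commit:

    import numpy as np

    # Typical LoRA shapes: W is (out, in), A is (r, in), B is (out, r).
    out_features, in_features, r = 8, 16, 4
    lora_alpha = 8  # hypothetical adapter hyperparameters, not from this commit
    scaling = lora_alpha / r

    W = np.random.randn(out_features, in_features).astype(np.float32)
    A = np.random.randn(r, in_features).astype(np.float32)
    B = np.random.randn(out_features, r).astype(np.float32)

    # w = w + BA*s: the delta B @ A has the same (out, in) shape as W,
    # which is what lets llama.cpp add it onto the base tensor in place.
    W_merged = W + scaling * (B @ A)
    assert W_merged.shape == W.shape

The 1 GiB scratch buffer (1024ull * 1024ull * 1024ull) replaces the earlier 100 MiB one, presumably so the merge of larger tensors no longer overflows the temporary ggml context; the TODO notes it should eventually be sized from the largest possible tensor.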
