
Commit 418cdb4

convergence with NXSL 0.06

- IQ2_K row meta size correction
- validate row data exclusions
- loosen gguf restrictions
- IK_Llama.cpp quant strategies; custom quant strategies
- IQ3_KS quant strategy

1 parent d384cb6, commit 418cdb4

File tree

7 files changed: +538, -84 lines


ggml/src/ggml-quants.c

Lines changed: 3 additions & 2 deletions

@@ -5410,8 +5410,9 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes
         return false;
     }
 
-    if (nbytes % ggml_type_size(type) != 0) {
-    // if (type != GGML_TYPE_IQ2_TN && type != GGML_TYPE_IQ1_TN && type != GGML_TYPE_IQ4_KS && type != GGML_TYPE_IQ3_KS && nbytes % ggml_type_size(type) != 0) {
+    if (type != GGML_TYPE_IQ2_BN && type != GGML_TYPE_IQ1_BN
+    // && type != GGML_TYPE_IQ2_TN && type != GGML_TYPE_IQ1_TN &&
+        && type != GGML_TYPE_IQ2_KS && type != GGML_TYPE_IQ4_KS && type != GGML_TYPE_IQ4_KSS && type != GGML_TYPE_IQ3_KS && type != GGML_TYPE_IQ5_KS && type != GGML_TYPE_IQ2_KT && type != GGML_TYPE_IQ3_KT && type != GGML_TYPE_IQ4_KT && nbytes % ggml_type_size(type) != 0) {
         fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type));
         return false;
     }
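
The types excluded here appear to share one trait: their rows are not a plain multiple of the block type size (per-row scales in the KS variants, trellis data in the KT variants), so the blanket nbytes % ggml_type_size(type) check would reject valid tensors. A minimal sketch of the per-row alternative, assuming a hypothetical layout with a 4-byte row-scale prefix (names and sizes are illustrative, not the real ones):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// Hypothetical row layout: a 4-byte scale prefix followed by packed blocks.
// nbytes is then a multiple of the row size, but not necessarily of the
// block size, which is exactly the case the exclusion list accommodates.
enum { ROW_META_BYTES = 4 };

static bool nbytes_ok_per_row(size_t nbytes, size_t block_bytes, int64_t blocks_per_row) {
    const size_t row_size = ROW_META_BYTES + block_bytes*(size_t)blocks_per_row;
    return row_size != 0 && nbytes % row_size == 0;
}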

ggml/src/ggml.c

Lines changed: 1 addition & 1 deletion

@@ -1397,7 +1397,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         // .vec_dot       = vec_dot_iq2_k_q8_k,
         // .vec_dot_type  = GGML_TYPE_Q8_K,
         // .nrows         = 1,
-        .row_meta_size = 2,
+        .row_meta_size = 0,
     },
     [GGML_TYPE_IQ2_K_R4] = {
         .type_name     = "iq2_k_r4",
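
For scale, a short worked example of what the stray row meta size does to offsets. It assumes IQ2_K packs 256 weights into a 76-byte block, a figure derived from the 2.375 bpw that llama_model_ftype_name reports (256 * 2.375 / 8 = 76):

// With n_per_row = 4096 there are 4096/256 = 16 blocks per row, so:
//   row_meta_size = 2 (wrong)    -> row size = 2 + 16*76 = 1218 bytes
//   row_meta_size = 0 (this fix) -> row size =     16*76 = 1216 bytes
// The phantom 2-byte prefix shifts every row offset derived from
// ggml_row_size(), so IQ2_K data written without the prefix no longer
// lines up when the tensor is walked row by row.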

ggml/src/gguf.cpp

Lines changed: 5 additions & 1 deletion

@@ -599,6 +599,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params)
         }
         const size_t type_size = ggml_type_size(info.t.type);
         const int64_t blck_size = ggml_blck_size(info.t.type);
+        // const int64_t row_size = ggml_row_size(info.t.type, info.t.ne[0]);
 
         // check that row size is divisible by block size
         if (blck_size == 0 || info.t.ne[0] % blck_size != 0) {
@@ -611,6 +612,7 @@
 
         // calculate byte offsets given the tensor shape and type
         info.t.nb[0] = type_size;
+        // info.t.nb[1] = info.t.nb[0]*(info.t.ne[0]/blck_size);
         info.t.nb[1] = ggml_row_size(info.t.type, info.t.ne[0]);
         for (int j = 2; j < GGML_MAX_DIMS; ++j) {
             info.t.nb[j] = info.t.nb[j - 1]*info.t.ne[j - 1];
@@ -1162,7 +1164,9 @@ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type)
     GGML_ASSERT(tensor->ne[0] % blck_size == 0 && "tensor row size not divisible by block size of new type");
 
     tensor->nb[0] = type_size;
-    tensor->nb[1] = tensor->nb[0]*(tensor->ne[0]/blck_size);
+    // tensor->nb[1] = tensor->nb[0]*(tensor->ne[0]/blck_size);
+    tensor->nb[1] = ggml_row_size(type, tensor->ne[0]);
+
     for (int i = 2; i < GGML_MAX_DIMS; i++) {
         tensor->nb[i] = tensor->nb[i - 1]*tensor->ne[i - 1];
     }
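
Both active changes swap the hand-rolled stride nb[0]*(ne[0]/blck_size) for ggml_row_size(), which for plain block types computes the same value but also covers types whose rows carry metadata. A small sketch of that invariant against the real ggml API (the helper names are made up):

#include <assert.h>
#include <stdint.h>
#include "ggml.h"

// Naive stride: bytes per block times blocks per row. Only correct when a
// row is nothing but packed blocks.
static size_t naive_row_stride(enum ggml_type type, int64_t ne0) {
    return ggml_type_size(type)*(size_t)(ne0/ggml_blck_size(type));
}

// ggml_row_size() is the authoritative stride; in this fork it is assumed
// to meet or exceed the naive product when per-row metadata is present.
static size_t checked_row_stride(enum ggml_type type, int64_t ne0) {
    const size_t rs = ggml_row_size(type, ne0);
    assert(rs >= naive_row_stride(type, ne0));
    return rs;
}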

include/llama.h

Lines changed: 8 additions & 0 deletions

@@ -426,6 +426,14 @@ extern "C" {
         enum llama_ftype ftype;              // quantize to this llama_ftype
         enum ggml_type output_tensor_type;   // output tensor type
         enum ggml_type token_embedding_type; // token embeddings tensor type
+        enum ggml_type attn_q_type;          // attention query tensor type
+        enum ggml_type attn_k_type;          // attention key tensor type
+        enum ggml_type attn_v_type;          // attention value tensor type
+        enum ggml_type attn_qkv_type;        // attention query-key-value tensor type
+        enum ggml_type attn_output_type;     // attention output tensor type
+        enum ggml_type ffn_gate_type;        // feedforward network gate type
+        enum ggml_type ffn_down_type;        // feedforward network down type
+        enum ggml_type ffn_up_type;          // feedforward network up type
         bool allow_requantize;               // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;         // quantize output.weight
         bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
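
These eight fields extend the per-tensor overrides that output_tensor_type and token_embedding_type already provide. A hedged usage sketch follows; the file names and target types are illustrative, and the new fields are assumed to behave like the existing two (ignored when left at their defaults):

#include "llama.h"

int main(void) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype         = LLAMA_FTYPE_MOSTLY_IQ3_KS; // base recipe for everything else
    params.attn_v_type   = GGML_TYPE_IQ5_KS;          // keep attention V weights heavier
    params.ffn_down_type = GGML_TYPE_IQ4_KS;          // spend extra bits on ffn_down
    // hypothetical input/output paths
    return (int) llama_model_quantize("model-f16.gguf", "model-iq3_ks.gguf", &params);
}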

src/llama-model-loader.cpp

Lines changed: 1 addition & 1 deletion

@@ -68,7 +68,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ2_KT: return "IQ2_KT - 2.125 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_K:  return "IQ2_K - 2.375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_K:  return "IQ3_K - 3.4325 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_KS: return "IQ3_K - 3.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_KS: return "IQ3_KS - 3.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_KL: return "IQ3_KL - 4 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_KT: return "IQ3_KT - 3.125 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_KT: return "IQ4_KT - 4.0 bpw";
