
Commit 418cdb4

convergence with NXSL 0.06

- IQ2_K row meta size correction
- validate row data exclusions
- loosen gguf restrictions
- IK_Llama.cpp quant strategies; custom quant strategies
- IQ3_KS quant strategy

1 parent d384cb6, commit 418cdb4

File tree

7 files changed: +538, -84 lines


ggml/src/ggml-quants.c

Lines changed: 3 additions & 2 deletions

@@ -5410,8 +5410,9 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes
         return false;
     }
 
-    if (nbytes % ggml_type_size(type) != 0) {
-    // if (type != GGML_TYPE_IQ2_TN && type != GGML_TYPE_IQ1_TN && type != GGML_TYPE_IQ4_KS && type != GGML_TYPE_IQ3_KS && nbytes % ggml_type_size(type) != 0) {
+    if (type != GGML_TYPE_IQ2_BN && type != GGML_TYPE_IQ1_BN
+    // && type != GGML_TYPE_IQ2_TN && type != GGML_TYPE_IQ1_TN &&
+        && type != GGML_TYPE_IQ2_KS && type != GGML_TYPE_IQ4_KS && type != GGML_TYPE_IQ4_KSS && type != GGML_TYPE_IQ3_KS && type != GGML_TYPE_IQ5_KS && type != GGML_TYPE_IQ2_KT && type != GGML_TYPE_IQ3_KT && type != GGML_TYPE_IQ4_KT && nbytes % ggml_type_size(type) != 0) {
         fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type));
         return false;
     }
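
The types excluded here appear to share one trait: their rows are not a plain multiple of the block type size (per-row scales in the KS variants, trellis data in the KT variants), so the blanket nbytes % ggml_type_size(type) check would reject valid tensors. A minimal sketch of the per-row alternative, assuming a hypothetical layout with a 4-byte row-scale prefix (names and sizes are illustrative, not the real ones):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// Hypothetical row layout: a 4-byte scale prefix followed by packed blocks.
// nbytes is then a multiple of the row size, but not necessarily of the
// block size, which is exactly the case the exclusion list accommodates.
enum { ROW_META_BYTES = 4 };

static bool nbytes_ok_per_row(size_t nbytes, size_t block_bytes, int64_t blocks_per_row) {
    const size_t row_size = ROW_META_BYTES + block_bytes*(size_t)blocks_per_row;
    return row_size != 0 && nbytes % row_size == 0;
}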

ggml/src/ggml.c

Lines changed: 1 addition & 1 deletion

@@ -1397,7 +1397,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         // .vec_dot       = vec_dot_iq2_k_q8_k,
         // .vec_dot_type  = GGML_TYPE_Q8_K,
         // .nrows         = 1,
-        .row_meta_size = 2,
+        .row_meta_size = 0,
     },
     [GGML_TYPE_IQ2_K_R4] = {
         .type_name     = "iq2_k_r4",
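
For scale, a short worked example of what the stray row meta size does to offsets. It assumes IQ2_K packs 256 weights into a 76-byte block, a figure derived from the 2.375 bpw that llama_model_ftype_name reports (256 * 2.375 / 8 = 76):

// With n_per_row = 4096 there are 4096/256 = 16 blocks per row, so:
//   row_meta_size = 2 (wrong)    -> row size = 2 + 16*76 = 1218 bytes
//   row_meta_size = 0 (this fix) -> row size =     16*76 = 1216 bytes
// The phantom 2-byte prefix shifts every row offset derived from
// ggml_row_size(), so IQ2_K data written without the prefix no longer
// lines up when the tensor is walked row by row.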

ggml/src/gguf.cpp

Lines changed: 5 additions & 1 deletion

@@ -599,6 +599,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params)
         }
         const size_t type_size = ggml_type_size(info.t.type);
         const int64_t blck_size = ggml_blck_size(info.t.type);
+        // const int64_t row_size = ggml_row_size(info.t.type, info.t.ne[0]);
 
         // check that row size is divisible by block size
         if (blck_size == 0 || info.t.ne[0] % blck_size != 0) {
@@ -611,6 +612,7 @@
 
         // calculate byte offsets given the tensor shape and type
         info.t.nb[0] = type_size;
+        // info.t.nb[1] = info.t.nb[0]*(info.t.ne[0]/blck_size);
         info.t.nb[1] = ggml_row_size(info.t.type, info.t.ne[0]);
         for (int j = 2; j < GGML_MAX_DIMS; ++j) {
             info.t.nb[j] = info.t.nb[j - 1]*info.t.ne[j - 1];
@@ -1162,7 +1164,9 @@ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type)
     GGML_ASSERT(tensor->ne[0] % blck_size == 0 && "tensor row size not divisible by block size of new type");
 
     tensor->nb[0] = type_size;
-    tensor->nb[1] = tensor->nb[0]*(tensor->ne[0]/blck_size);
+    // tensor->nb[1] = tensor->nb[0]*(tensor->ne[0]/blck_size);
+    tensor->nb[1] = ggml_row_size(type, tensor->ne[0]);
+
     for (int i = 2; i < GGML_MAX_DIMS; i++) {
         tensor->nb[i] = tensor->nb[i - 1]*tensor->ne[i - 1];
     }
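
Both active changes swap the hand-rolled stride nb[0]*(ne[0]/blck_size) for ggml_row_size(), which for plain block types computes the same value but also covers types whose rows carry metadata. A small sketch of that invariant against the real ggml API (the helper names are made up):

#include <assert.h>
#include <stdint.h>
#include "ggml.h"

// Naive stride: bytes per block times blocks per row. Only correct when a
// row is nothing but packed blocks.
static size_t naive_row_stride(enum ggml_type type, int64_t ne0) {
    return ggml_type_size(type)*(size_t)(ne0/ggml_blck_size(type));
}

// ggml_row_size() is the authoritative stride; in this fork it is assumed
// to meet or exceed the naive product when per-row metadata is present.
static size_t checked_row_stride(enum ggml_type type, int64_t ne0) {
    const size_t rs = ggml_row_size(type, ne0);
    assert(rs >= naive_row_stride(type, ne0));
    return rs;
}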

include/llama.h

Lines changed: 8 additions & 0 deletions

@@ -426,6 +426,14 @@ extern "C" {
         enum llama_ftype ftype;              // quantize to this llama_ftype
         enum ggml_type output_tensor_type;   // output tensor type
         enum ggml_type token_embedding_type; // token embeddings tensor type
+        enum ggml_type attn_q_type;          // attention query tensor type
+        enum ggml_type attn_k_type;          // attention key tensor type
+        enum ggml_type attn_v_type;          // attention value tensor type
+        enum ggml_type attn_qkv_type;        // attention query-key-value tensor type
+        enum ggml_type attn_output_type;     // attention output tensor type
+        enum ggml_type ffn_gate_type;        // feedforward network gate type
+        enum ggml_type ffn_down_type;        // feedforward network down type
+        enum ggml_type ffn_up_type;          // feedforward network up type
         bool allow_requantize;               // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;         // quantize output.weight
         bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
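
These eight fields extend the per-tensor overrides that output_tensor_type and token_embedding_type already provide. A hedged usage sketch follows; the file names and target types are illustrative, and the new fields are assumed to behave like the existing two (ignored when left at their defaults):

#include "llama.h"

int main(void) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype         = LLAMA_FTYPE_MOSTLY_IQ3_KS; // base recipe for everything else
    params.attn_v_type   = GGML_TYPE_IQ5_KS;          // keep attention V weights heavier
    params.ffn_down_type = GGML_TYPE_IQ4_KS;          // spend extra bits on ffn_down
    // hypothetical input/output paths
    return (int) llama_model_quantize("model-f16.gguf", "model-iq3_ks.gguf", &params);
}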

src/llama-model-loader.cpp

Lines changed: 1 addition & 1 deletion

@@ -68,7 +68,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ2_KT: return "IQ2_KT - 2.125 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_K:  return "IQ2_K - 2.375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_K:  return "IQ3_K - 3.4325 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_KS: return "IQ3_K - 3.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_KS: return "IQ3_KS - 3.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_KL: return "IQ3_KL - 4 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_KT: return "IQ3_KT - 3.125 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_KT: return "IQ4_KT - 4.0 bpw";
