Convergence LCPP/IKL and cleanup part 2

Nexesenex · Nexesenex · commit 7c5f4449691d · 2025-06-22T04:01:18.000+02:00
diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c
@@ -7,6 +7,8 @@
 #include "ggml-cpu-impl.h"
 #include "ggml-cpu.h"
 
+#include "iqk_croco/iqk_quantize_croco.h"
+
 #include <math.h>
 #include <string.h>
 #include <assert.h>
diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
@@ -1,6 +1,8 @@
 #include "convert.cuh"
 #include "dequantize.cuh"
 
+#include <cstdint>
+
 #define CUDA_Q8_0_NE_ALIGN 2048
 
 #define UNUSED GGML_UNUSED
@@ -1464,10 +1466,10 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
             return dequantize_row_iq1_s_cuda;
         case GGML_TYPE_IQ1_M:
             return dequantize_row_iq1_m_cuda;
-        // case GGML_TYPE_IQ1_BN:
-            // return dequantize_row_iq1_bn_cuda;
-        // case GGML_TYPE_IQ2_BN:
-            // return dequantize_row_iq2_bn_cuda;
+        case GGML_TYPE_IQ1_BN:
+            return dequantize_row_iq1_bn_cuda;
+        case GGML_TYPE_IQ2_BN:
+            return dequantize_row_iq2_bn_cuda;
         case GGML_TYPE_IQ4_NL:
             return dequantize_row_iq4_nl_cuda;
         case GGML_TYPE_IQ4_XS:
@@ -1548,10 +1550,10 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_iq1_s_cuda;
         case GGML_TYPE_IQ1_M:
             return dequantize_row_iq1_m_cuda;
-        // case GGML_TYPE_IQ1_BN:
-            // return dequantize_row_iq1_bn_cuda;
-        // case GGML_TYPE_IQ2_BN:
-            // return dequantize_row_iq2_bn_cuda;
+        case GGML_TYPE_IQ1_BN:
+            return dequantize_row_iq1_bn_cuda;
+        case GGML_TYPE_IQ2_BN:
+            return dequantize_row_iq2_bn_cuda;
         case GGML_TYPE_IQ4_NL:
             return dequantize_row_iq4_nl_cuda;
         case GGML_TYPE_IQ4_XS:
diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu
@@ -329,7 +329,7 @@ void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_ten
     const ggml_tensor * Q   = dst->src[0];
 
     const int32_t precision = KQV->op_params[3];
-    // GGML_ASSERT_CONTINUE(precision == GGML_PREC_DEFAULT);
+    GGML_ASSERT_CONTINUE(precision == GGML_PREC_DEFAULT);
 
     float logit_softcap;
     memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
@@ -370,7 +370,7 @@ void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml
     const ggml_tensor * V   = dst->src[2];
 
     const int32_t precision = KQV->op_params[3];
-    // GGML_ASSERT_CONTINUE(precision == GGML_PREC_DEFAULT);
+    GGML_ASSERT_CONTINUE(precision == GGML_PREC_DEFAULT);
 
     GGML_ASSERT(K->type == type_K);
     GGML_ASSERT(V->type == type_V);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -466,6 +466,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 #if defined(GGML_USE_HIP)
             mappings.push_back({start_ptr, reserve_size});
 #endif
+
             // the memory allocation handle is no longer needed after mapping
             CU_CHECK(cuMemRelease(handle));
 
diff --git a/ggml/src/ggml-cuda/iqk_mmvq.cu b/ggml/src/ggml-cuda/iqk_mmvq.cu
@@ -919,3 +919,4 @@ void mul_mat_vec_iq2_bn_q8_1_cuda(
 
     iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ2_BN, 1, vec_dot_iq2_bn_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
 }
+
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh
@@ -216,7 +216,6 @@ namespace ggml_cuda_mma {
 
     static __device__ __forceinline__ void mma(
             tile<16, 8, int> & D, const tile<16, 4, int> & A, const tile<8, 4, int> & B) {
-
 #ifdef NEW_MMA_AVAILABLE
 #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
         asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
@@ -334,7 +333,6 @@ namespace ggml_cuda_mma {
 
     static __device__ __forceinline__ void mma(
             tile<16, 8, float> & D, const tile<16, 8, half2> & A, const tile<8, 8, half2> & B) {
-
 #ifdef NEW_MMA_AVAILABLE
         const int * Axi = (const int *) A.x;
         const int * Bxi = (const int *) B.x;
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
@@ -1,4 +1,7 @@
 #include "mmq.cuh"
+#include "quantize.cuh"
+
+#include <vector>
 
 void ggml_cuda_op_mul_mat_q(
     ggml_backend_cuda_context & ctx,
diff --git a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
@@ -27,7 +27,8 @@
     "GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
     "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS", "GGML_TYPE_Q6_0",
     "GGML_TYPE_IQ2_KS", "GGML_TYPE_IQ4_KS", "GGML_TYPE_IQ5_KS",
-    "GGML_TYPE_IQ2_K", "GGML_TYPE_IQ3_K", "GGML_TYPE_IQ4_K", "GGML_TYPE_IQ5_K", "GGML_TYPE_IQ6_K"
+    "GGML_TYPE_IQ2_K",
+    "GGML_TYPE_IQ3_K", "GGML_TYPE_IQ4_K", "GGML_TYPE_IQ5_K", "GGML_TYPE_IQ6_K"
 ]
 
 SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
diff --git a/ggml/src/ggml-cuda/vendors/cuda.h b/ggml/src/ggml-cuda/vendors/cuda.h
@@ -5,7 +5,6 @@
 #include <cublas_v2.h>
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
-#include <cuda_bf16.h>
 
 #if CUDART_VERSION < 11020
 #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
@@ -4,7 +4,6 @@
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
-// #include <hip/hip_bf16.h>
 #include <hip/hip_bfloat16.h>
 #ifdef __HIP_PLATFORM_AMD__
 // for rocblas_initialize()
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
@@ -6,6 +6,8 @@
 #include "ggml-cpu/ggml-cpu-impl.h"
 #include "ggml-cpu.h"
 
+#include "iqk_croco/iqk_quantize_croco.h"
+
 #include <math.h>
 #include <string.h>
 #include <assert.h>
diff --git a/ggml/src/iqk_croco/iqk_common_croco.h b/ggml/src/iqk_croco/iqk_common_croco.h
@@ -17,6 +17,7 @@
 
 #include "ggml-impl.h"
 #include "ggml-quants.h"
+#include "ggml-cpu/ggml-cpu-quants.h"
 #include "iqk_mul_mat_croco.h"
 #include "iqk_quantize_croco.h"
 
diff --git a/ggml/src/iqk_croco/iqk_quantize_croco.cpp b/ggml/src/iqk_croco/iqk_quantize_croco.cpp
@@ -16,7 +16,7 @@
 #include "iqk_quantize_croco.h"
 #include "iqk_config_croco.h"
 
-#include "ggml-cpu-quants.h"
+#include "ggml-cpu/ggml-cpu-quants.h"
 #include "ggml-cpu/ggml-cpu-impl.h"
 #include "ggml-cpu.h"
 
diff --git a/ggml/src/iqk_croco/iqk_quantize_croco.h b/ggml/src/iqk_croco/iqk_quantize_croco.h
diff --git a/tools/main/main.cpp b/tools/main/main.cpp

Original file line number	Diff line number	Diff line change
`@@ -919,3 +919,4 @@ void mul_mat_vec_iq2_bn_q8_1_cuda(`
`919`	`919`
`920`	`920`	`iqk_mul_mat_vec_q_cuda<GGML_TYPE_IQ2_BN, 1, vec_dot_iq2_bn_q8_1>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);`
`921`	`921`	`}`
	`922`	`+`
Original file line number	Diff line number	Diff line change
`@@ -27,7 +27,8 @@`
`27`	`27`	`"GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",`
`28`	`28`	`"GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS", "GGML_TYPE_Q6_0",`
`29`	`29`	`"GGML_TYPE_IQ2_KS", "GGML_TYPE_IQ4_KS", "GGML_TYPE_IQ5_KS",`
`30`		`- "GGML_TYPE_IQ2_K", "GGML_TYPE_IQ3_K", "GGML_TYPE_IQ4_K", "GGML_TYPE_IQ5_K", "GGML_TYPE_IQ6_K"`
	`30`	`+ "GGML_TYPE_IQ2_K",`
	`31`	`+ "GGML_TYPE_IQ3_K", "GGML_TYPE_IQ4_K", "GGML_TYPE_IQ5_K", "GGML_TYPE_IQ6_K"`
`31`	`32`	`]`
`32`	`33`
`33`	`34`	`SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.`